diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index dae88cf40f420d..72ecebca781f4c 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -2178,6 +2178,13 @@ public class Config extends ConfigBase { }) public static int autobucket_min_buckets = 1; + @ConfField + public static int full_auto_analyze_simultaneously_running_task_num = 1; + + @ConfField + public static final int period_analyze_simultaneously_running_task_num = 1; + + @ConfField(mutable = true, description = { "Doris 为了兼用 mysql 周边工具生态,会内置一个名为 mysql 的数据库,如果该数据库与用户自建数据库冲突," + "请修改这个字段,为 doris 内置的 mysql database 更换一个名字", diff --git a/fe/fe-core/src/main/cup/sql_parser.cup b/fe/fe-core/src/main/cup/sql_parser.cup index 74896554b7e9ee..6cc9a54a51f4af 100644 --- a/fe/fe-core/src/main/cup/sql_parser.cup +++ b/fe/fe-core/src/main/cup/sql_parser.cup @@ -4140,13 +4140,17 @@ show_param ::= RESULT = new ShowCreateMaterializedViewStmt(mvName, tableName); :} /* show analyze job */ - | KW_ANALYZE opt_table_name:tbl opt_wild_where order_by_clause:orderByClause limit_clause:limitClause + | KW_ANALYZE opt_table_name:tbl opt_wild_where {: - RESULT = new ShowAnalyzeStmt(tbl, parser.where, orderByClause, limitClause); + RESULT = new ShowAnalyzeStmt(tbl, parser.where, false); :} - | KW_ANALYZE INTEGER_LITERAL:jobId opt_wild_where order_by_clause:orderByClause limit_clause:limitClause + | KW_ANALYZE INTEGER_LITERAL:jobId opt_wild_where {: - RESULT = new ShowAnalyzeStmt(jobId, parser.where, orderByClause, limitClause); + RESULT = new ShowAnalyzeStmt(jobId, parser.where); + :} + | KW_AUTO KW_ANALYZE opt_table_name:tbl opt_wild_where + {: + RESULT = new ShowAnalyzeStmt(tbl, parser.where, true); :} | KW_ANALYZE KW_TASK KW_STATUS INTEGER_LITERAL:jobId {: diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java index ccb122bc26986c..d7e639da3a5bec 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java @@ -22,16 +22,18 @@ import org.apache.doris.statistics.AnalysisInfo.AnalysisType; import com.google.common.collect.ImmutableSet; +import com.google.gson.annotations.SerializedName; import org.apache.commons.lang3.StringUtils; +import org.apache.logging.log4j.core.util.CronExpression; +import java.util.HashMap; import java.util.Map; import java.util.Optional; import java.util.concurrent.TimeUnit; +// TODO: Remove map public class AnalyzeProperties { - private final Map properties; - public static final String PROPERTY_SYNC = "sync"; public static final String PROPERTY_INCREMENTAL = "incremental"; public static final String PROPERTY_AUTOMATIC = "automatic"; @@ -41,6 +43,23 @@ public class AnalyzeProperties { public static final String PROPERTY_ANALYSIS_TYPE = "analysis.type"; public static final String PROPERTY_PERIOD_SECONDS = "period.seconds"; + public static final String PROPERTY_FORCE_FULL = "force.full"; + + public static final AnalyzeProperties DEFAULT_PROP = new AnalyzeProperties(new HashMap() { + { + put(AnalyzeProperties.PROPERTY_SYNC, "false"); + put(AnalyzeProperties.PROPERTY_AUTOMATIC, "false"); + put(AnalyzeProperties.PROPERTY_ANALYSIS_TYPE, AnalysisType.FUNDAMENTALS.toString()); + } + }); + + public static final String PROPERTY_PERIOD_CRON = "period.cron"; + + private CronExpression cronExpression; + + @SerializedName("analyzeProperties") + private final Map properties; + private static final ImmutableSet PROPERTIES_SET = new ImmutableSet.Builder() .add(PROPERTY_SYNC) .add(PROPERTY_INCREMENTAL) @@ -50,6 +69,8 @@ public class AnalyzeProperties { .add(PROPERTY_NUM_BUCKETS) .add(PROPERTY_ANALYSIS_TYPE) .add(PROPERTY_PERIOD_SECONDS) + .add(PROPERTY_PERIOD_CRON) + 
.add(PROPERTY_FORCE_FULL) .build(); public AnalyzeProperties(Map properties) { @@ -72,6 +93,7 @@ public void check() throws AnalysisException { checkAnalysisMode(msgTemplate); checkAnalysisType(msgTemplate); checkScheduleType(msgTemplate); + checkPeriod(); } public boolean isSync() { @@ -115,6 +137,10 @@ public long getPeriodTimeInMs() { return TimeUnit.SECONDS.toMillis(minutes); } + public CronExpression getCron() { + return cronExpression; + } + private void checkPeriodSeconds() throws AnalysisException { if (properties.containsKey(PROPERTY_PERIOD_SECONDS)) { checkNumericProperty(PROPERTY_PERIOD_SECONDS, properties.get(PROPERTY_PERIOD_SECONDS), @@ -207,6 +233,22 @@ private void checkScheduleType(String msgTemplate) throws AnalysisException { } } + private void checkPeriod() throws AnalysisException { + if (properties.containsKey(PROPERTY_PERIOD_SECONDS) + && properties.containsKey(PROPERTY_PERIOD_CRON)) { + throw new AnalysisException(PROPERTY_PERIOD_SECONDS + " and " + PROPERTY_PERIOD_CRON + + " couldn't be set simultaneously"); + } + String cronExprStr = properties.get(PROPERTY_PERIOD_CRON); + if (cronExprStr != null) { + try { + cronExpression = new CronExpression(cronExprStr); + } catch (java.text.ParseException e) { + throw new AnalysisException("Invalid cron expression: " + cronExprStr); + } + } + } + private void checkNumericProperty(String key, String value, int lowerBound, int upperBound, boolean includeBoundary, String errorMsg) throws AnalysisException { if (!StringUtils.isNumeric(value)) { @@ -226,6 +268,14 @@ public boolean isSample() { || properties.containsKey(PROPERTY_SAMPLE_ROWS); } + public boolean forceFull() { + return properties.containsKey(PROPERTY_FORCE_FULL); + } + + public boolean isSampleRows() { + return properties.containsKey(PROPERTY_SAMPLE_ROWS); + } + public String toSQL() { StringBuilder sb = new StringBuilder(); sb.append("PROPERTIES("); diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java index 6f1f7c64d8f84c..ae2c6a7ff4830f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java @@ -23,6 +23,8 @@ import org.apache.doris.statistics.AnalysisInfo.AnalysisType; import org.apache.doris.statistics.AnalysisInfo.ScheduleType; +import org.apache.logging.log4j.core.util.CronExpression; + import java.util.Map; public class AnalyzeStmt extends StatementBase { @@ -55,7 +57,8 @@ public ScheduleType getScheduleType() { if (analyzeProperties.isAutomatic()) { return ScheduleType.AUTOMATIC; } - return analyzeProperties.getPeriodTimeInMs() > 0 ? ScheduleType.PERIOD : ScheduleType.ONCE; + return analyzeProperties.getPeriodTimeInMs() > 0 || analyzeProperties.getCron() != null + ? ScheduleType.PERIOD : ScheduleType.ONCE; } public boolean isSync() { @@ -86,4 +89,12 @@ public AnalyzeProperties getAnalyzeProperties() { public RedirectStatus getRedirectStatus() { return RedirectStatus.FORWARD_WITH_SYNC; } + + public CronExpression getCron() { + return analyzeProperties.getCron(); + } + + public boolean forceFull() { + return analyzeProperties.forceFull(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java index 920b60627a8653..cbc66f367f260f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java @@ -24,6 +24,7 @@ import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.View; +import org.apache.doris.catalog.external.ExternalTable; import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.Config; @@ -41,6 +42,7 @@ import com.google.common.collect.Sets; 
import org.apache.commons.lang3.StringUtils; +import java.util.Collections; import java.util.List; import java.util.Optional; import java.util.Set; @@ -84,7 +86,7 @@ public class AnalyzeTblStmt extends AnalyzeStmt { private final TableName tableName; private List columnNames; - private List partitionNames; + private PartitionNames partitionNames; private boolean isAllColumns; // after analyzed @@ -97,7 +99,7 @@ public AnalyzeTblStmt(TableName tableName, AnalyzeProperties properties) { super(properties); this.tableName = tableName; - this.partitionNames = partitionNames == null ? null : partitionNames.getPartitionNames(); + this.partitionNames = partitionNames; this.columnNames = columnNames; this.analyzeProperties = properties; this.isAllColumns = columnNames == null; @@ -166,11 +168,9 @@ public void check() throws AnalysisException { analyzeProperties.check(); // TODO support external table - if (analyzeProperties.isSample()) { - if (!(table instanceof OlapTable)) { - throw new AnalysisException("Sampling statistics " - + "collection of external tables is not supported"); - } + if (analyzeProperties.isSampleRows() && !(table instanceof OlapTable)) { + throw new AnalysisException("Sampling statistics " + + "collection of external tables is not supported with rows, use percent instead."); } if (analyzeProperties.isSync() && (analyzeProperties.isAutomatic() || analyzeProperties.getPeriodTimeInMs() != 0)) { @@ -181,6 +181,9 @@ public void check() throws AnalysisException { throw new AnalysisException("Automatic collection " + "and period statistics collection cannot be set at same time"); } + if (analyzeProperties.isSample() && analyzeProperties.forceFull()) { + throw new AnalysisException("Impossible to analyze with sample and full simultaneously"); + } } private void checkColumn() throws AnalysisException { @@ -196,7 +199,8 @@ private void checkColumn() throws AnalysisException { } } if (containsUnsupportedTytpe) { - if 
(!ConnectContext.get().getSessionVariable().enableAnalyzeComplexTypeColumn) { + if (ConnectContext.get() == null + || !ConnectContext.get().getSessionVariable().enableAnalyzeComplexTypeColumn) { columnNames = columnNames.stream() .filter(c -> !StatisticsUtil.isUnsupportedType(table.getColumn(c).getType())) .collect(Collectors.toList()); @@ -236,14 +240,33 @@ public Set getColumnNames() { } public Set getPartitionNames() { - Set partitions = partitionNames == null ? table.getPartitionNames() : Sets.newHashSet(partitionNames); - if (isSamplingPartition()) { - int partNum = ConnectContext.get().getSessionVariable().getExternalTableAnalyzePartNum(); - partitions = partitions.stream().limit(partNum).collect(Collectors.toSet()); + if (partitionNames == null || partitionNames.getPartitionNames() == null) { + if (table instanceof ExternalTable) { + // External table couldn't return all partitions when partitionNames is not set. + // Because Analyze Table command for external table could specify partition names. 
+ return Collections.emptySet(); + } + return table.getPartitionNames(); } + Set partitions = Sets.newHashSet(); + partitions.addAll(partitionNames.getPartitionNames()); return partitions; } + public boolean isAllPartitions() { + if (partitionNames == null) { + return false; + } + return partitionNames.isAllPartitions(); + } + + public long getPartitionCount() { + if (partitionNames == null) { + return 0; + } + return partitionNames.getCount(); + } + public boolean isPartitionOnly() { return partitionNames != null; } @@ -260,8 +283,13 @@ public boolean isSamplingPartition() { } private void checkAnalyzePriv(String dbName, String tblName) throws AnalysisException { + ConnectContext ctx = ConnectContext.get(); + // means it a system analyze + if (ctx == null) { + return; + } if (!Env.getCurrentEnv().getAccessManager() - .checkTblPriv(ConnectContext.get(), dbName, tblName, PrivPredicate.SELECT)) { + .checkTblPriv(ctx, dbName, tblName, PrivPredicate.SELECT)) { ErrorReport.reportAnalysisException( ErrorCode.ERR_TABLEACCESS_DENIED_ERROR, "ANALYZE", diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionNames.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionNames.java index 1140dfc6777641..ca26a2978e0e54 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionNames.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionNames.java @@ -48,15 +48,37 @@ public class PartitionNames implements ParseNode, Writable { // true if these partitions are temp partitions @SerializedName(value = "isTemp") private final boolean isTemp; + private final boolean allPartitions; + private final long count; + // Default partition count to collect statistic for external table. 
+ private static final long DEFAULT_PARTITION_COUNT = 100; public PartitionNames(boolean isTemp, List partitionNames) { this.partitionNames = partitionNames; this.isTemp = isTemp; + this.allPartitions = false; + this.count = 0; } public PartitionNames(PartitionNames other) { this.partitionNames = Lists.newArrayList(other.partitionNames); this.isTemp = other.isTemp; + this.allPartitions = other.allPartitions; + this.count = 0; + } + + public PartitionNames(boolean allPartitions) { + this.partitionNames = null; + this.isTemp = false; + this.allPartitions = allPartitions; + this.count = 0; + } + + public PartitionNames(long partitionCount) { + this.partitionNames = null; + this.isTemp = false; + this.allPartitions = false; + this.count = partitionCount; } public List getPartitionNames() { @@ -67,9 +89,23 @@ public boolean isTemp() { return isTemp; } + public boolean isAllPartitions() { + return allPartitions; + } + + public long getCount() { + return count; + } + @Override public void analyze(Analyzer analyzer) throws AnalysisException { - if (partitionNames.isEmpty()) { + if (allPartitions && count > 0) { + throw new AnalysisException("All partition and partition count couldn't be set at the same time."); + } + if (allPartitions || count > 0) { + return; + } + if (partitionNames == null || partitionNames.isEmpty()) { throw new AnalysisException("No partition specified in partition lists"); } // check if partition name is not empty string diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowAnalyzeStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowAnalyzeStmt.java index 95035641a7e86d..fb19cb2fd5bf95 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowAnalyzeStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowAnalyzeStmt.java @@ -25,7 +25,6 @@ import org.apache.doris.common.ErrorCode; import org.apache.doris.common.ErrorReport; import org.apache.doris.common.UserException; -import 
org.apache.doris.common.util.OrderByPair; import org.apache.doris.mysql.privilege.PrivPredicate; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.ShowResultSetMetaData; @@ -35,10 +34,6 @@ import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; -import java.util.ArrayList; -import java.util.List; -import java.util.stream.IntStream; - /** * ShowAnalyzeStmt is used to show statistics job info. * syntax: @@ -69,36 +64,30 @@ public class ShowAnalyzeStmt extends ShowStmt { .build(); private long jobId; - private TableName dbTableName; - private Expr whereClause; - private LimitElement limitElement; - private List orderByElements; + private final TableName dbTableName; + private final Expr whereClause; + + // extract from predicate private String stateValue; - private ArrayList orderByPairs; - public ShowAnalyzeStmt() { - } + private final boolean auto; + public ShowAnalyzeStmt(TableName dbTableName, - Expr whereClause, - List orderByElements, - LimitElement limitElement) { + Expr whereClause, boolean auto) { this.dbTableName = dbTableName; this.whereClause = whereClause; - this.orderByElements = orderByElements; - this.limitElement = limitElement; + this.auto = auto; + } public ShowAnalyzeStmt(long jobId, - Expr whereClause, - List orderByElements, - LimitElement limitElement) { + Expr whereClause) { Preconditions.checkArgument(jobId > 0, "JobId must greater than 0."); this.jobId = jobId; this.dbTableName = null; this.whereClause = whereClause; - this.orderByElements = orderByElements; - this.limitElement = limitElement; + this.auto = false; } public long getJobId() { @@ -111,12 +100,6 @@ public String getStateValue() { return stateValue; } - public ArrayList getOrderByPairs() { - Preconditions.checkArgument(isAnalyzed(), - "The orderByPairs must be obtained after the parsing is complete"); - return orderByPairs; - } - public Expr getWhereClause() { Preconditions.checkArgument(isAnalyzed(), "The whereClause must be 
obtained after the parsing is complete"); @@ -124,13 +107,6 @@ public Expr getWhereClause() { return whereClause; } - public long getLimit() { - if (limitElement != null && limitElement.hasLimit()) { - return limitElement.getLimit(); - } - return -1L; - } - @Override public void analyze(Analyzer analyzer) throws UserException { if (!Config.enable_stats) { @@ -149,21 +125,6 @@ public void analyze(Analyzer analyzer) throws UserException { if (whereClause != null) { analyzeSubPredicate(whereClause); } - - // analyze order by - if (orderByElements != null && !orderByElements.isEmpty()) { - orderByPairs = new ArrayList<>(); - for (OrderByElement orderByElement : orderByElements) { - if (orderByElement.getExpr() instanceof SlotRef) { - SlotRef slotRef = (SlotRef) orderByElement.getExpr(); - int index = analyzeColumn(slotRef.getColumnName()); - OrderByPair orderByPair = new OrderByPair(index, !orderByElement.getIsAsc()); - orderByPairs.add(orderByPair); - } else { - throw new AnalysisException("Should order by column"); - } - } - } } @Override @@ -279,25 +240,6 @@ public String toSql() { sb.append(whereClause.toSql()); } - // Order By clause - if (orderByElements != null) { - sb.append(" "); - sb.append("ORDER BY"); - sb.append(" "); - IntStream.range(0, orderByElements.size()).forEach(i -> { - sb.append(orderByElements.get(i).getExpr().toSql()); - sb.append((orderByElements.get(i).getIsAsc()) ? " ASC" : " DESC"); - sb.append((i + 1 != orderByElements.size()) ? 
", " : ""); - }); - } - - if (getLimit() != -1L) { - sb.append(" "); - sb.append("LIMIT"); - sb.append(" "); - sb.append(getLimit()); - } - return sb.toString(); } @@ -309,4 +251,8 @@ public String toString() { public TableName getDbTableName() { return dbTableName; } + + public boolean isAuto() { + return auto; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowTableStatsStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowTableStatsStmt.java index da10d5c492b1fe..fe499fa1b0849f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowTableStatsStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowTableStatsStmt.java @@ -32,12 +32,13 @@ import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.ShowResultSet; import org.apache.doris.qe.ShowResultSetMetaData; -import org.apache.doris.statistics.TableStatistic; -import org.apache.doris.statistics.util.StatisticsUtil; +import org.apache.doris.statistics.TableStatsMeta; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; +import java.sql.Date; +import java.util.ArrayList; import java.util.List; public class ShowTableStatsStmt extends ShowStmt { @@ -45,9 +46,12 @@ public class ShowTableStatsStmt extends ShowStmt { // TODO add more columns private static final ImmutableList TITLE_NAMES = new ImmutableList.Builder() + .add("updated_rows") + .add("query_times") .add("row_count") - .add("update_time") - .add("last_analyze_time") + .add("updated_time") + .add("columns") + .add("trigger") .build(); private final TableName tableName; @@ -126,12 +130,33 @@ public long getPartitionId() { return table.getPartition(partitionName).getId(); } - public ShowResultSet constructResultSet(TableStatistic tableStatistic) { + public ShowResultSet constructResultSet(TableStatsMeta tableStatistic) { + if (tableStatistic == null) { + return new ShowResultSet(getMetaData(), new ArrayList<>()); + } List> result = Lists.newArrayList(); 
List row = Lists.newArrayList(); + row.add(String.valueOf(tableStatistic.updatedRows)); + row.add(String.valueOf(tableStatistic.queriedTimes.get())); row.add(String.valueOf(tableStatistic.rowCount)); - row.add(String.valueOf(tableStatistic.updateTime)); - row.add(StatisticsUtil.getReadableTime(tableStatistic.lastAnalyzeTimeInMs)); + row.add(new Date(tableStatistic.updatedTime).toString()); + row.add(tableStatistic.analyzeColumns().toString()); + row.add(tableStatistic.jobType.toString()); + result.add(row); + return new ShowResultSet(getMetaData(), result); + } + + public ShowResultSet constructResultSet(long rowCount) { + List> result = Lists.newArrayList(); + List row = Lists.newArrayList(); + row.add(""); + row.add(""); + row.add(String.valueOf(rowCount)); + row.add(""); + row.add(""); + row.add(""); + row.add(""); + row.add(""); result.add(row); return new ShowResultSet(getMetaData(), result); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java index 6748a8b4afc353..2b0f581375cf13 100755 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java @@ -211,7 +211,7 @@ import org.apache.doris.resource.workloadgroup.WorkloadGroupMgr; import org.apache.doris.service.FrontendOptions; import org.apache.doris.statistics.AnalysisManager; -import org.apache.doris.statistics.StatisticsAutoAnalyzer; +import org.apache.doris.statistics.StatisticsAutoCollector; import org.apache.doris.statistics.StatisticsCache; import org.apache.doris.statistics.StatisticsCleaner; import org.apache.doris.statistics.query.QueryStats; @@ -457,7 +457,7 @@ public class Env { */ private final LoadManagerAdapter loadManagerAdapter; - private StatisticsAutoAnalyzer statisticsAutoAnalyzer; + private StatisticsAutoCollector statisticsAutoCollector; private HiveTransactionMgr hiveTransactionMgr; @@ -663,7 +663,7 @@ private Env(boolean 
isCheckpointCatalog) { this.extMetaCacheMgr = new ExternalMetaCacheMgr(); this.analysisManager = new AnalysisManager(); this.statisticsCleaner = new StatisticsCleaner(); - this.statisticsAutoAnalyzer = new StatisticsAutoAnalyzer(); + this.statisticsAutoCollector = new StatisticsAutoCollector(); this.globalFunctionMgr = new GlobalFunctionMgr(); this.workloadGroupMgr = new WorkloadGroupMgr(); this.queryStats = new QueryStats(); @@ -907,8 +907,8 @@ public void initialize(String[] args) throws Exception { if (statisticsCleaner != null) { statisticsCleaner.start(); } - if (statisticsAutoAnalyzer != null) { - statisticsAutoAnalyzer.start(); + if (statisticsAutoCollector != null) { + statisticsAutoCollector.start(); } } @@ -5420,8 +5420,8 @@ public LoadManagerAdapter getLoadManagerAdapter() { return loadManagerAdapter; } - public StatisticsAutoAnalyzer getStatisticsAutoAnalyzer() { - return statisticsAutoAnalyzer; + public StatisticsAutoCollector getStatisticsAutoCollector() { + return statisticsAutoCollector; } public QueryStats getQueryStats() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java index fd42de00f7834d..b7fdec73f02679 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java @@ -83,7 +83,6 @@ public void run() { return; } Database database = op.get(); - modifyTblReplicaCount(database, StatisticConstants.ANALYSIS_TBL_NAME); modifyTblReplicaCount(database, StatisticConstants.STATISTIC_TBL_NAME); modifyTblReplicaCount(database, StatisticConstants.HISTOGRAM_TBL_NAME); } @@ -126,7 +125,6 @@ public void modifyTblReplicaCount(Database database, String tblName) { } private void createTbl() throws UserException { - Env.getCurrentEnv().getInternalCatalog().createTable(buildAnalysisTblStmt()); 
Env.getCurrentEnv().getInternalCatalog().createTable(buildStatisticsTblStmt()); Env.getCurrentEnv().getInternalCatalog().createTable(buildHistogramTblStmt()); } @@ -145,41 +143,6 @@ public static void createDB() { } } - @VisibleForTesting - public CreateTableStmt buildAnalysisTblStmt() throws UserException { - TableName tableName = new TableName("", - FeConstants.INTERNAL_DB_NAME, StatisticConstants.ANALYSIS_TBL_NAME); - List columnDefs = new ArrayList<>(); - columnDefs.add(new ColumnDef("id", TypeDef.createVarchar(StatisticConstants.ID_LEN))); - columnDefs.add(new ColumnDef("catalog_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN))); - columnDefs.add(new ColumnDef("db_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN))); - columnDefs.add(new ColumnDef("tbl_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN))); - columnDefs.add(new ColumnDef("idx_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN))); - ColumnDef partId = new ColumnDef("part_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN)); - partId.setAllowNull(true); - columnDefs.add(partId); - columnDefs.add(new ColumnDef("count", TypeDef.create(PrimitiveType.BIGINT))); - columnDefs.add(new ColumnDef("last_analyze_time_in_ms", TypeDef.create(PrimitiveType.BIGINT))); - columnDefs.add(new ColumnDef("update_time", TypeDef.create(PrimitiveType.DATETIME))); - String engineName = "olap"; - ArrayList uniqueKeys = Lists.newArrayList("id", "catalog_id", - "db_id", "tbl_id", "idx_id", "part_id"); - KeysDesc keysDesc = new KeysDesc(KeysType.UNIQUE_KEYS, uniqueKeys); - DistributionDesc distributionDesc = new HashDistributionDesc( - StatisticConstants.STATISTIC_TABLE_BUCKET_COUNT, uniqueKeys); - Map properties = new HashMap() { - { - put("replication_num", String.valueOf( - Math.max(1, Config.min_replication_num_per_tablet))); - } - }; - CreateTableStmt createTableStmt = new CreateTableStmt(true, false, - tableName, columnDefs, engineName, keysDesc, null, distributionDesc, - 
properties, null, "Doris internal statistics table, DO NOT MODIFY IT", null); - StatisticsUtil.analyze(createTableStmt); - return createTableStmt; - } - @VisibleForTesting public CreateTableStmt buildStatisticsTblStmt() throws UserException { TableName tableName = new TableName("", @@ -281,8 +244,7 @@ private boolean created() { } return false; } - return db.getTable(StatisticConstants.HISTOGRAM_TBL_NAME).isPresent() - && db.getTable(StatisticConstants.ANALYSIS_TBL_NAME).isPresent(); + return db.getTable(StatisticConstants.HISTOGRAM_TBL_NAME).isPresent(); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 9975ba0230ac92..f59df2554d36b2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -55,6 +55,8 @@ import org.apache.doris.statistics.HistogramTask; import org.apache.doris.statistics.MVAnalysisTask; import org.apache.doris.statistics.OlapAnalysisTask; +import org.apache.doris.statistics.TableStatsMeta; +import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.system.Backend; import org.apache.doris.system.SystemInfoService; import org.apache.doris.thrift.TColumn; @@ -2249,4 +2251,55 @@ public void analyze(String dbName) { } } } + + @Override + public Map> findReAnalyzeNeededPartitions() { + TableIf table = this; + TableStatsMeta tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId()); + Set allPartitions = table.getPartitionNames().stream().map(table::getPartition) + .filter(Partition::hasData).map(Partition::getName).collect(Collectors.toSet()); + if (tableStats == null) { + return table.getBaseSchema().stream().collect(Collectors.toMap(Column::getName, v -> allPartitions)); + } + Map> colToPart = new HashMap<>(); + for (Column col : table.getBaseSchema()) { + long lastUpdateTime = 
tableStats.findColumnLastUpdateTime(col.getName()); + Set partitions = table.getPartitionNames().stream() + .map(table::getPartition) + .filter(Partition::hasData) + .filter(partition -> + partition.getVisibleVersionTime() >= lastUpdateTime).map(Partition::getName) + .collect(Collectors.toSet()); + colToPart.put(col.getName(), partitions); + } + return colToPart; + } + + public long getDataSize(boolean singleReplica) { + long dataSize = 0; + for (Partition partition : getAllPartitions()) { + dataSize += partition.getDataSize(singleReplica); + } + return dataSize; + } + + public boolean needReAnalyzeTable(TableStatsMeta tblStats) { + if (tblStats == null) { + return true; + } + long rowCount = getRowCount(); + // TODO: Do we need to analyze an empty table? + if (rowCount == 0) { + return false; + } + if (!tblStats.analyzeColumns().containsAll(getBaseSchema() + .stream() + .map(Column::getName) + .collect(Collectors.toSet()))) { + return true; + } + long updateRows = tblStats.updatedRows.get(); + int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows); + return tblHealth < Config.table_stats_health_threshold; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java index 0c50fc42b4b135..ba7e55c7d86629 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java @@ -30,6 +30,7 @@ import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.ColumnStatistic; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.thrift.TTableDescriptor; import com.google.common.base.Preconditions; @@ -557,4 +558,14 @@ public Optional getColumnStatistic(String colName) { public void analyze(String dbName) { } + + @Override + public Map> findReAnalyzeNeededPartitions() { + return Collections.emptyMap(); + } + + 
@Override + public boolean needReAnalyzeTable(TableStatsMeta tblStats) { + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java index 78717f0eca769d..108d227e591669 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java @@ -23,6 +23,7 @@ import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.ColumnStatistic; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.thrift.TTableDescriptor; import com.google.common.collect.Lists; @@ -33,6 +34,7 @@ import java.io.IOException; import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.concurrent.TimeUnit; @@ -236,5 +238,16 @@ default boolean isManagedTable() { default long getLastUpdateTime() { return -1L; } + + Map> findReAnalyzeNeededPartitions(); + + default long getDataSize(boolean singleReplica) { + // TODO: Each tableIf should impl it by itself. 
+ return 0; + } + + boolean needReAnalyzeTable(TableStatsMeta tblStats); + + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java index c0de97969f9dd0..01b3ce9ee2d0b2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java @@ -35,8 +35,10 @@ import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.ColumnStatistic; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.thrift.TTableDescriptor; +import com.google.common.collect.Sets; import com.google.gson.annotations.SerializedName; import lombok.Getter; import org.apache.commons.lang3.NotImplementedException; @@ -46,10 +48,14 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.stream.Collectors; /** * External table represent tables that are not self-managed by Doris. @@ -376,4 +382,19 @@ public void gsonPostProcess() throws IOException { rwLock = new ReentrantReadWriteLock(true); objectCreated = false; } + + @Override + public boolean needReAnalyzeTable(TableStatsMeta tblStats) { + // TODO: Find a way to decide if this external table need to be reanalyzed. + // For now, simply return true for all external tables. + return true; + } + + @Override + public Map> findReAnalyzeNeededPartitions() { + HashSet partitions = Sets.newHashSet(); + // TODO: Find a way to collect external table partitions that need to be analyzed. 
+ partitions.add("Dummy Partition"); + return getBaseSchema().stream().collect(Collectors.toMap(Column::getName, k -> partitions)); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java index 1a11d890058fdb..728eec3e6f923d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java @@ -32,7 +32,7 @@ import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; import org.apache.doris.statistics.HMSAnalysisTask; -import org.apache.doris.statistics.TableStatistic; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.thrift.THiveTable; import org.apache.doris.thrift.TTableDescriptor; @@ -57,6 +57,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; import java.time.LocalDate; @@ -102,16 +103,19 @@ public class HMSExternalTable extends ExternalTable { SUPPORTED_HUDI_FILE_FORMATS.add("com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat"); } - private volatile org.apache.hadoop.hive.metastore.api.Table remoteTable = null; - private List partitionColumns; + protected volatile org.apache.hadoop.hive.metastore.api.Table remoteTable = null; + protected List partitionColumns; - private DLAType dlaType = DLAType.UNKNOWN; + protected DLAType dlaType = DLAType.UNKNOWN; + + // Not as precise as row count in TableStats, but better than none.
+ private long estimatedRowCount = -1; // record the partition update time when enable hms event listener protected volatile long partitionUpdateTime; public enum DLAType { - UNKNOWN, HIVE, HUDI, ICEBERG + UNKNOWN, HIVE, HUDI, ICEBERG, DELTALAKE } /** @@ -126,6 +130,10 @@ public HMSExternalTable(long id, String name, String dbName, HMSExternalCatalog super(id, name, catalog, dbName, TableType.HMS_EXTERNAL_TABLE); } + public HMSExternalTable(long id, String name, String dbName, HMSExternalCatalog catalog, TableType type) { + super(id, name, catalog, dbName, type); + } + public boolean isSupportedHmsTable() { makeSureInitialized(); return dlaType != DLAType.UNKNOWN; @@ -149,6 +157,7 @@ protected synchronized void makeSureInitialized() { } } objectCreated = true; + estimatedRowCount = getRowCountFromExternalSource(true); } } @@ -267,10 +276,19 @@ public long getCreateTime() { @Override public long getRowCount() { makeSureInitialized(); + long rowCount = getRowCountFromExternalSource(false); + if (rowCount == -1) { + LOG.debug("Will estimate row count from file list."); + rowCount = StatisticsUtil.getRowCountFromFileList(this); + } + return rowCount; + } + + private long getRowCountFromExternalSource(boolean isInit) { long rowCount; switch (dlaType) { case HIVE: - rowCount = StatisticsUtil.getHiveRowCount(this); + rowCount = StatisticsUtil.getHiveRowCount(this, isInit); break; case ICEBERG: rowCount = StatisticsUtil.getIcebergRowCount(this); @@ -279,10 +297,6 @@ public long getRowCount() { LOG.warn("getRowCount for dlaType {} is not supported.", dlaType); rowCount = -1; } - if (rowCount == -1) { - LOG.debug("Will estimate row count from file list."); - rowCount = StatisticsUtil.getRowCountFromFileList(this); - } return rowCount; } @@ -420,13 +434,20 @@ public List getHudiSchema(List hmsSchema) { @Override public long estimatedRowCount() { try { - Optional tableStatistics = Env.getCurrentEnv().getStatisticsCache().getTableStatistics( - catalog.getId(), 
catalog.getDbOrAnalysisException(dbName).getId(), id); - if (tableStatistics.isPresent()) { - long rowCount = tableStatistics.get().rowCount; + TableStatsMeta tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(id); + if (tableStats != null) { + long rowCount = tableStats.rowCount; LOG.debug("Estimated row count for db {} table {} is {}.", dbName, name, rowCount); return rowCount; } + + if (estimatedRowCount != -1) { + return estimatedRowCount; + } + // Cache the estimated row count in this structure + // though the table never get analyzed, since the row estimation might be expensive caused by RPC. + estimatedRowCount = getRowCount(); + return estimatedRowCount; } catch (Exception e) { LOG.warn("Fail to get row count for table {}", name, e); } @@ -447,7 +468,7 @@ private List getIcebergSchema(List hmsSchema) { return tmpSchema; } - private void initPartitionColumns(List schema) { + protected void initPartitionColumns(List schema) { List partitionKeys = remoteTable.getPartitionKeys().stream().map(FieldSchema::getName) .collect(Collectors.toList()); partitionColumns = Lists.newArrayListWithCapacity(partitionKeys.size()); @@ -478,7 +499,7 @@ public Optional getColumnStatistic(String colName) { return getHiveColumnStats(colName); case ICEBERG: return StatisticsUtil.getIcebergColumnStats(colName, - Env.getCurrentEnv().getExtMetaCacheMgr().getIcebergMetadataCache().getIcebergTable(this)); + Env.getCurrentEnv().getExtMetaCacheMgr().getIcebergMetadataCache().getIcebergTable(this)); default: LOG.warn("get column stats for dlaType {} is not supported.", dlaType); } @@ -617,6 +638,12 @@ public void setPartitionUpdateTime(long updateTime) { public long getUpdateTime() { return Math.max(this.schemaUpdateTime, this.partitionUpdateTime); } + + @Override + public void gsonPostProcess() throws IOException { + super.gsonPostProcess(); + estimatedRowCount = -1; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/JdbcExternalTable.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/JdbcExternalTable.java index 051bfa5e585d1d..a02c59080fc4eb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/JdbcExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/JdbcExternalTable.java @@ -24,14 +24,13 @@ import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.JdbcAnalysisTask; -import org.apache.doris.statistics.TableStatistic; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.thrift.TTableDescriptor; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.util.List; -import java.util.Optional; /** * Elasticsearch external table. @@ -112,16 +111,11 @@ public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) { @Override public long getRowCount() { makeSureInitialized(); - try { - Optional tableStatistics = Env.getCurrentEnv().getStatisticsCache().getTableStatistics( - catalog.getId(), catalog.getDbOrAnalysisException(dbName).getId(), id); - if (tableStatistics.isPresent()) { - long rowCount = tableStatistics.get().rowCount; - LOG.debug("Estimated row count for db {} table {} is {}.", dbName, name, rowCount); - return rowCount; - } - } catch (Exception e) { - LOG.warn("Fail to get row count for table {}", name, e); + TableStatsMeta tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(id); + if (tableStats != null) { + long rowCount = tableStats.rowCount; + LOG.debug("Estimated row count for db {} table {} is {}.", dbName, name, rowCount); + return rowCount; } return 1; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/ThreadPoolManager.java b/fe/fe-core/src/main/java/org/apache/doris/common/ThreadPoolManager.java index be8731b6b25444..31d608b8a258a9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/ThreadPoolManager.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/common/ThreadPoolManager.java @@ -134,6 +134,15 @@ public static ThreadPoolExecutor newDaemonFixedThreadPool(int numThread, int que poolName, needRegisterMetric); } + public static ThreadPoolExecutor newDaemonFixedThreadPool(int numThread, int queueSize, + String poolName, + boolean needRegisterMetric, + RejectedExecutionHandler handler) { + return newDaemonThreadPool(numThread, numThread, KEEP_ALIVE_TIME, TimeUnit.SECONDS, + new LinkedBlockingQueue<>(queueSize), handler, + poolName, needRegisterMetric); + } + public static ThreadPoolExecutor newDaemonFixedPriorityThreadPool(int numThread, int initQueueSize, Comparator comparator, Class tClass, String poolName, boolean needRegisterMetric) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java index 2d8925b5d98607..cebc526d14b62d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java @@ -175,4 +175,7 @@ default CatalogLog constructEditLog() { public Collection getAllDbs(); public ConcurrentHashMap getIdToDb(); + + public boolean enableAutoAnalyze(); + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogMgr.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogMgr.java index 5dce5646fc402a..356c116ae426d6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogMgr.java @@ -64,8 +64,10 @@ import java.io.DataOutput; import java.io.IOException; import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.TreeMap; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.Function; @@ -1175,5 +1177,9 @@ public void gsonPostProcess() throws IOException { 
public Map getIdToCatalog() { return idToCatalog; } + + public Set getCopyOfCatalog() { + return new HashSet<>(idToCatalog.values()); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java index 499a1f9385f366..22a6816543e31b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java @@ -74,6 +74,8 @@ public abstract class ExternalCatalog implements CatalogIf>, Writable, GsonPostProcessable { private static final Logger LOG = LogManager.getLogger(ExternalCatalog.class); + public static final String ENABLE_AUTO_ANALYZE = "enable.auto.analyze"; + // Unique id of this catalog, will be assigned after catalog is loaded. @SerializedName(value = "id") protected long id; @@ -597,4 +599,18 @@ public Collection getAllDbs() { public ConcurrentHashMap getIdToDb() { return new ConcurrentHashMap<>(idToDb); } + + @Override + public boolean enableAutoAnalyze() { + // By default, external catalog disables auto analyze, users could set catalog property to enable it: + // "enable.auto.analyze" = true + Map properties = catalogProperty.getProperties(); + boolean ret = false; + if (properties.containsKey(ENABLE_AUTO_ANALYZE) + && properties.get(ENABLE_AUTO_ANALYZE).equalsIgnoreCase("true")) { + ret = true; + } + return ret; + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java index 283ae2dcc6725d..0777f2c1edcace 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java @@ -3175,4 +3175,9 @@ public ConcurrentHashMap getIdToDb() { public Collection getAllDbs() { return new HashSet<>(idToDb.values()); } + + @Override + public boolean
enableAutoAnalyze() { + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/journal/JournalEntity.java b/fe/fe-core/src/main/java/org/apache/doris/journal/JournalEntity.java index d3d0fe18d905de..3ceca15c7e724c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/journal/JournalEntity.java +++ b/fe/fe-core/src/main/java/org/apache/doris/journal/JournalEntity.java @@ -111,6 +111,7 @@ import org.apache.doris.persist.TableInfo; import org.apache.doris.persist.TablePropertyInfo; import org.apache.doris.persist.TableRenameColumnInfo; +import org.apache.doris.persist.TableStatsDeletionLog; import org.apache.doris.persist.TruncateTableInfo; import org.apache.doris.plugin.PluginInfo; import org.apache.doris.policy.DropPolicyLog; @@ -118,6 +119,7 @@ import org.apache.doris.policy.StoragePolicy; import org.apache.doris.resource.workloadgroup.WorkloadGroup; import org.apache.doris.statistics.AnalysisInfo; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.system.Backend; import org.apache.doris.system.Frontend; import org.apache.doris.transaction.TransactionState; @@ -846,6 +848,21 @@ public void readFields(DataInput in) throws IOException { isRead = true; break; } + case OperationType.OP_UPDATE_TABLE_STATS: { + data = TableStatsMeta.read(in); + isRead = true; + break; + } + case OperationType.OP_PERSIST_AUTO_JOB: { + data = AnalysisInfo.read(in); + isRead = true; + break; + } + case OperationType.OP_DELETE_TABLE_STATS: { + data = TableStatsDeletionLog.read(in); + isRead = true; + break; + } default: { IOException e = new IOException(); LOG.error("UNKNOWN Operation Type {}", opCode, e); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java index 1f7255b7990ace..aa8f4d6cc7cfda 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java @@ -86,8 +86,7 @@ public static Cost addChildCost(Plan plan, Cost planCost, Cost childCost, int in CostV1 planCostV1 = (CostV1) planCost; return new CostV1(childCostV1.getCpuCost() + planCostV1.getCpuCost(), childCostV1.getMemoryCost() + planCostV1.getMemoryCost(), - childCostV1.getNetworkCost() + planCostV1.getNetworkCost(), - childCostV1.getPenalty() + planCostV1.getPenalty()); + childCostV1.getNetworkCost() + planCostV1.getNetworkCost()); } @Override @@ -118,7 +117,7 @@ public Cost visitPhysicalStorageLayerAggregate( CostV1 costValue = (CostV1) storageLayerAggregate.getRelation().accept(this, context); // multiply a factor less than 1, so we can select PhysicalStorageLayerAggregate as far as possible return new CostV1(costValue.getCpuCost() * 0.7, costValue.getMemoryCost(), - costValue.getNetworkCost(), costValue.getPenalty()); + costValue.getNetworkCost()); } @Override @@ -150,14 +149,14 @@ public Cost visitPhysicalQuickSort( // TODO: consider two-phase sort and enforcer. Statistics statistics = context.getStatisticsWithCheck(); Statistics childStatistics = context.getChildStatistics(0); + + double childRowCount = childStatistics.getRowCount(); + double rowCount = statistics.getRowCount(); if (physicalQuickSort.getSortPhase().isGather()) { // Now we do more like two-phase sort, so penalise one-phase sort - statistics = statistics.withRowCount(statistics.getRowCount() * 100); + rowCount *= 100; } - return CostV1.of( - childStatistics.getRowCount(), - statistics.getRowCount(), - childStatistics.getRowCount()); + return CostV1.of(childRowCount, rowCount, childRowCount); } @Override @@ -165,14 +164,14 @@ public Cost visitPhysicalTopN(PhysicalTopN topN, PlanContext con // TODO: consider two-phase sort and enforcer. 
Statistics statistics = context.getStatisticsWithCheck(); Statistics childStatistics = context.getChildStatistics(0); + + double childRowCount = childStatistics.getRowCount(); + double rowCount = statistics.getRowCount(); if (topN.getSortPhase().isGather()) { // Now we do more like two-phase sort, so penalise one-phase sort - statistics = statistics.withRowCount(statistics.getRowCount() * 100); + rowCount *= 100; } - return CostV1.of( - childStatistics.getRowCount(), - statistics.getRowCount(), - childStatistics.getRowCount()); + return CostV1.of(childRowCount, rowCount, childRowCount); } @Override @@ -186,9 +185,9 @@ public Cost visitPhysicalPartitionTopN(PhysicalPartitionTopN par Statistics statistics = context.getStatisticsWithCheck(); Statistics childStatistics = context.getChildStatistics(0); return CostV1.of( - childStatistics.getRowCount(), - statistics.getRowCount(), - childStatistics.getRowCount()); + childStatistics.getRowCount(), + statistics.getRowCount(), + childStatistics.getRowCount()); } @Override @@ -287,30 +286,38 @@ public Cost visitPhysicalHashJoin( pattern2: (L join1 Agg1) join2 agg2 in pattern2, join1 and join2 takes more time, but Agg1 and agg2 can be processed in parallel. 
*/ - double penalty = HEAVY_OPERATOR_PUNISH_FACTOR - * Math.min(probeStats.getPenalty(), buildStats.getPenalty()); - if (buildStats.getWidth() >= 2) { - //penalty for right deep tree - penalty += rightRowCount; - } if (physicalHashJoin.getJoinType().isCrossJoin()) { return CostV1.of(leftRowCount + rightRowCount + outputRowCount, 0, - leftRowCount + rightRowCount, - penalty); + leftRowCount + rightRowCount + ); } if (context.isBroadcastJoin()) { - double broadcastJoinPenalty = broadCastJoinBalancePenalty(probeStats, buildStats); - return CostV1.of(leftRowCount * broadcastJoinPenalty + rightRowCount + outputRowCount, + // compared with shuffle join, bc join will be taken a penalty for both build and probe side; + // currently we use the following factor as the penalty factor: + // build side factor: totalInstanceNumber to the power of 2, standing for the additional effort for + // bigger cost for building hash table, taken on rightRowCount + // probe side factor: totalInstanceNumber to the power of 2, standing for the additional effort for + // bigger cost for ProbeWhenBuildSideOutput effort and ProbeWhenSearchHashTableTime + // on the output rows, taken on outputRowCount() + double probeSideFactor = 1.0; + double buildSideFactor = ConnectContext.get().getSessionVariable().getBroadcastRightTableScaleFactor(); + int parallelInstance = Math.max(1, ConnectContext.get().getSessionVariable().getParallelExecInstanceNum()); + int totalInstanceNumber = parallelInstance * beNumber; + if (buildSideFactor <= 1.0) { + // use totalInstanceNumber to the power of 2 as the default factor value + buildSideFactor = Math.pow(totalInstanceNumber, 0.5); + } + // TODO: since the outputs rows may expand a lot, penalty on it will cause bc never be chosen. + // will refine this in next generation cost model. 
+ return CostV1.of(leftRowCount + rightRowCount * buildSideFactor + outputRowCount * probeSideFactor, rightRowCount, - 0, 0 ); } return CostV1.of(leftRowCount + rightRowCount + outputRowCount, rightRowCount, - 0, 0 ); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostV1.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostV1.java index b5c5b50bd2e7b3..bf1cc425999f7c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostV1.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostV1.java @@ -19,23 +19,19 @@ class CostV1 implements Cost { private static final CostV1 INFINITE = new CostV1(Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, - Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY); - private static final CostV1 ZERO = new CostV1(0, 0, 0, 0); + Double.POSITIVE_INFINITY); + private static final CostV1 ZERO = new CostV1(0, 0, 0); private final double cpuCost; private final double memoryCost; private final double networkCost; - //penalty for - // 1. right deep tree - // 2. right XXX join - private final double penalty; private final double cost; /** * Constructor of CostEstimate. 
*/ - public CostV1(double cpuCost, double memoryCost, double networkCost, double penaltiy) { + public CostV1(double cpuCost, double memoryCost, double networkCost) { // TODO: fix stats cpuCost = Double.max(0, cpuCost); memoryCost = Double.max(0, memoryCost); @@ -43,11 +39,10 @@ public CostV1(double cpuCost, double memoryCost, double networkCost, double pena this.cpuCost = cpuCost; this.memoryCost = memoryCost; this.networkCost = networkCost; - this.penalty = penaltiy; CostWeight costWeight = CostWeight.get(); this.cost = costWeight.cpuWeight * cpuCost + costWeight.memoryWeight * memoryCost - + costWeight.networkWeight * networkCost + costWeight.penaltyWeight * penalty; + + costWeight.networkWeight * networkCost; } public CostV1(double cost) { @@ -55,7 +50,6 @@ public CostV1(double cost) { this.cpuCost = 0; this.networkCost = 0; this.memoryCost = 0; - this.penalty = 0; } public static CostV1 infinite() { @@ -78,28 +72,20 @@ public double getNetworkCost() { return networkCost; } - public double getPenalty() { - return penalty; - } - public double getValue() { return cost; } - public static CostV1 of(double cpuCost, double maxMemory, double networkCost, double rightDeepPenaltiy) { - return new CostV1(cpuCost, maxMemory, networkCost, rightDeepPenaltiy); - } - public static CostV1 of(double cpuCost, double maxMemory, double networkCost) { - return new CostV1(cpuCost, maxMemory, networkCost, 0); + return new CostV1(cpuCost, maxMemory, networkCost); } public static CostV1 ofCpu(double cpuCost) { - return new CostV1(cpuCost, 0, 0, 0); + return new CostV1(cpuCost, 0, 0); } public static CostV1 ofMemory(double memoryCost) { - return new CostV1(0, memoryCost, 0, 0); + return new CostV1(0, memoryCost, 0); } @Override @@ -107,7 +93,7 @@ public String toString() { StringBuilder sb = new StringBuilder(); sb.append("[").append((long) cpuCost).append("/") .append((long) memoryCost).append("/").append((long) networkCost) - .append("/").append((long) penalty).append("]"); + 
.append("/").append("]"); return sb.toString(); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/processor/post/RuntimeFilterPruner.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/processor/post/RuntimeFilterPruner.java index aa1f10aa47849e..c9cc43d0c29d9f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/processor/post/RuntimeFilterPruner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/processor/post/RuntimeFilterPruner.java @@ -27,7 +27,6 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalAssertNumRows; import org.apache.doris.nereids.trees.plans.physical.PhysicalDistribute; import org.apache.doris.nereids.trees.plans.physical.PhysicalFilter; -import org.apache.doris.nereids.trees.plans.physical.PhysicalHashAggregate; import org.apache.doris.nereids.trees.plans.physical.PhysicalHashJoin; import org.apache.doris.nereids.trees.plans.physical.PhysicalLimit; import org.apache.doris.nereids.trees.plans.physical.PhysicalProject; @@ -54,17 +53,6 @@ */ public class RuntimeFilterPruner extends PlanPostProcessor { - // ******************************* - // Physical plans - // ******************************* - @Override - public PhysicalHashAggregate visitPhysicalHashAggregate( - PhysicalHashAggregate agg, CascadesContext context) { - agg.child().accept(this, context); - context.getRuntimeFilterContext().addEffectiveSrcNode(agg); - return agg; - } - @Override public PhysicalQuickSort visitPhysicalQuickSort(PhysicalQuickSort sort, CascadesContext context) { sort.child().accept(this, context); @@ -165,7 +153,9 @@ public PhysicalAssertNumRows visitPhysicalAssertNumRows(PhysicalAssertNumRows buildNdvInProbeRange * (1 + ColumnStatistic.STATS_ERROR); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java index b8a7975a087fad..aa1903e7b37ebb 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java @@ -151,7 +151,7 @@ public ColumnStatistic visitIf(If function, Statistics context) { return new ColumnStatisticBuilder() .setNdv(2) .setMinValue(0) - .setMaxValue(Double.MAX_VALUE) + .setMaxValue(Double.POSITIVE_INFINITY) .setAvgSizeByte(8) .setNumNulls(0) .build(); @@ -206,13 +206,15 @@ public ColumnStatistic visitLiteral(Literal literal, Statistics context) { return ColumnStatistic.UNKNOWN; } double literalVal = literal.getDouble(); - ColumnStatisticBuilder columnStatBuilder = new ColumnStatisticBuilder(); - columnStatBuilder.setMaxValue(literalVal); - columnStatBuilder.setMinValue(literalVal); - columnStatBuilder.setNdv(1); - columnStatBuilder.setNumNulls(1); - columnStatBuilder.setAvgSizeByte(1); - return columnStatBuilder.build(); + return new ColumnStatisticBuilder() + .setMaxValue(literalVal) + .setMinValue(literalVal) + .setNdv(1) + .setNumNulls(1) + .setAvgSizeByte(1) + .setMinExpr(literal.toLegacyLiteral()) + .setMaxExpr(literal.toLegacyLiteral()) + .build(); } @Override @@ -241,13 +243,13 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic, if (binaryArithmetic instanceof Add) { return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(leftMin + rightMin) - .setMaxValue(leftMax + rightMax).setSelectivity(1.0) + .setMaxValue(leftMax + rightMax) .setMinExpr(null).setMaxExpr(null).build(); } if (binaryArithmetic instanceof Subtract) { return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(leftMin - rightMax) - .setMaxValue(leftMax - rightMin).setSelectivity(1.0).setMinExpr(null) + .setMaxValue(leftMax - rightMin).setMinExpr(null) 
.setMaxExpr(null).build(); } // TODO: stat for multiply and divide produced by below algorithm may have huge deviation with reality. @@ -259,11 +261,11 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic, leftMax * rightMax); double max = Math.max( Math.max( - Math.max(leftMin * rightMin, leftMin * rightMax), - leftMax * rightMin), + Math.max(leftMin * rightMin, leftMin * rightMax), + leftMax * rightMin), leftMax * rightMax); return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) - .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(min).setMaxValue(max).setSelectivity(1.0) + .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(min).setMaxValue(max) .setMaxExpr(null).setMinExpr(null).build(); } if (binaryArithmetic instanceof Divide || binaryArithmetic instanceof IntegralDivide) { @@ -279,7 +281,7 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic, leftMax / noneZeroDivisor(rightMax)); return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) .setNumNulls(numNulls).setDataSize(binaryArithmetic.getDataType().width()).setMinValue(min) - .setMaxValue(max).setSelectivity(1.0).build(); + .setMaxValue(max).build(); } if (binaryArithmetic instanceof Mod) { double min = -Math.max(Math.abs(rightMin), Math.abs(rightMax)); @@ -309,13 +311,12 @@ public ColumnStatistic visitMin(Min min, Statistics context) { } /* we keep columnStat.min and columnStat.max, but set ndv=1. 
- if there is group-by keys, we will update ndv when visiting group clause + if there is group-by keys, we will update count when visiting group clause */ double width = min.child().getDataType().width(); - return new ColumnStatisticBuilder().setCount(1).setNdv(1).setAvgSizeByte(width).setNumNulls(width) - .setDataSize(child.getDataType().width()).setMinValue(columnStat.minValue) - .setMaxValue(columnStat.maxValue).setSelectivity(1.0) - .setMinExpr(null).build(); + return new ColumnStatisticBuilder().setCount(1).setNdv(1).setAvgSizeByte(width) + .setMinValue(columnStat.minValue).setMinExpr(columnStat.minExpr) + .setMaxValue(columnStat.maxValue).setMaxExpr(columnStat.maxExpr).build(); } @Override @@ -327,19 +328,20 @@ public ColumnStatistic visitMax(Max max, Statistics context) { } /* we keep columnStat.min and columnStat.max, but set ndv=1. - if there is group-by keys, we will update ndv when visiting group clause + if there is group-by keys, we will update count when visiting group clause */ int width = max.child().getDataType().width(); - return new ColumnStatisticBuilder().setCount(1D).setNdv(1D).setAvgSizeByte(width).setNumNulls(0) - .setDataSize(width).setMinValue(columnStat.minValue).setMaxValue(columnStat.maxValue) - .setSelectivity(1.0).setMaxExpr(null).setMinExpr(null).build(); + return new ColumnStatisticBuilder().setCount(1D).setNdv(1D).setAvgSizeByte(width) + .setMinValue(columnStat.minValue).setMinExpr(columnStat.minExpr) + .setMaxValue(columnStat.maxValue).setMaxExpr(columnStat.maxExpr) + .build(); } @Override public ColumnStatistic visitCount(Count count, Statistics context) { double width = count.getDataType().width(); return new ColumnStatisticBuilder().setCount(1D).setAvgSizeByte(width).setNumNulls(0) - .setDataSize(width).setMinValue(0).setMaxValue(context.getRowCount()).setSelectivity(1.0) + .setDataSize(width).setMinValue(0).setMaxValue(context.getRowCount()) .setMaxExpr(null).setMinExpr(null).build(); } @@ -367,7 +369,7 @@ public 
ColumnStatistic visitYear(Year year, Statistics context) { .setNumNulls(childStat.numNulls) .setDataSize(4 * childStat.count) .setMinValue(minYear) - .setMaxValue(maxYear).setSelectivity(1.0).setMinExpr(null).build(); + .setMaxValue(maxYear).setMinExpr(null).build(); } @Override @@ -378,7 +380,7 @@ public ColumnStatistic visitWeekOfYear(WeekOfYear weekOfYear, Statistics context .setNdv(54) .setAvgSizeByte(width) .setNumNulls(childStat.numNulls) - .setDataSize(1).setMinValue(1).setMaxValue(53).setSelectivity(1.0).setMinExpr(null) + .setDataSize(1).setMinValue(1).setMaxValue(53).setMinExpr(null) .build(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index c5ddbd285b37f4..f06c9d1cc4f4ee 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -17,10 +17,10 @@ package org.apache.doris.nereids.stats; +import org.apache.doris.analysis.LiteralExpr; import org.apache.doris.nereids.stats.FilterEstimation.EstimationContext; import org.apache.doris.nereids.trees.TreeNode; import org.apache.doris.nereids.trees.expressions.And; -import org.apache.doris.nereids.trees.expressions.Cast; import org.apache.doris.nereids.trees.expressions.ComparisonPredicate; import org.apache.doris.nereids.trees.expressions.CompoundPredicate; import org.apache.doris.nereids.trees.expressions.EqualTo; @@ -28,6 +28,7 @@ import org.apache.doris.nereids.trees.expressions.GreaterThan; import org.apache.doris.nereids.trees.expressions.GreaterThanEqual; import org.apache.doris.nereids.trees.expressions.InPredicate; +import org.apache.doris.nereids.trees.expressions.IsNull; import org.apache.doris.nereids.trees.expressions.LessThan; import org.apache.doris.nereids.trees.expressions.LessThanEqual; import org.apache.doris.nereids.trees.expressions.Like; @@ -37,6 
+38,7 @@ import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.functions.Function; +import org.apache.doris.nereids.trees.expressions.literal.Literal; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.statistics.Bucket; import org.apache.doris.statistics.ColumnStatistic; @@ -47,10 +49,11 @@ import org.apache.doris.statistics.Statistics; import org.apache.doris.statistics.StatisticsBuilder; +import com.google.common.base.Preconditions; +import com.google.common.collect.Sets; + import java.util.ArrayList; import java.util.List; -import java.util.Map; -import java.util.Map.Entry; import java.util.Set; import java.util.function.Predicate; @@ -81,7 +84,9 @@ public FilterEstimation(Set aggSlots) { public Statistics estimate(Expression expression, Statistics statistics) { // For a comparison predicate, only when it's left side is a slot and right side is a literal, we would // consider is a valid predicate. 
- return expression.accept(this, new EstimationContext(statistics)); + Statistics stats = expression.accept(this, new EstimationContext(statistics)); + stats.enforceValid(); + return stats; } @Override @@ -94,7 +99,7 @@ public Statistics visitCompoundPredicate(CompoundPredicate predicate, Estimation Expression leftExpr = predicate.child(0); Expression rightExpr = predicate.child(1); Statistics leftStats = leftExpr.accept(this, context); - Statistics andStats = rightExpr.accept(new FilterEstimation(), + Statistics andStats = rightExpr.accept(this, new EstimationContext(leftStats)); if (predicate instanceof And) { return andStats; @@ -102,27 +107,29 @@ public Statistics visitCompoundPredicate(CompoundPredicate predicate, Estimation Statistics rightStats = rightExpr.accept(this, context); double rowCount = leftStats.getRowCount() + rightStats.getRowCount() - andStats.getRowCount(); Statistics orStats = context.statistics.withRowCount(rowCount); - for (Map.Entry entry : orStats.columnStatistics().entrySet()) { - ColumnStatistic leftColStats = leftStats.findColumnStatistics(entry.getKey()); - ColumnStatistic rightColStats = rightStats.findColumnStatistics(entry.getKey()); - ColumnStatisticBuilder estimatedColStatsBuilder = new ColumnStatisticBuilder(entry.getValue()); - if (leftColStats.minValue <= rightColStats.minValue) { - estimatedColStatsBuilder.setMinValue(leftColStats.minValue); - estimatedColStatsBuilder.setMinExpr(leftColStats.minExpr); - } else { - estimatedColStatsBuilder.setMinValue(rightColStats.minValue); - estimatedColStatsBuilder.setMinExpr(rightColStats.minExpr); - } - if (leftColStats.maxValue >= rightColStats.maxValue) { - estimatedColStatsBuilder.setMaxValue(leftColStats.maxValue); - estimatedColStatsBuilder.setMaxExpr(leftColStats.maxExpr); - } else { - estimatedColStatsBuilder.setMaxValue(rightColStats.maxValue); - estimatedColStatsBuilder.setMaxExpr(rightColStats.maxExpr); + Set leftInputSlots = leftExpr.getInputSlots(); + Set rightInputSlots = 
rightExpr.getInputSlots(); + for (Slot slot : context.keyColumns) { + if (leftInputSlots.contains(slot) && rightInputSlots.contains(slot)) { + ColumnStatistic leftColStats = leftStats.findColumnStatistics(slot); + ColumnStatistic rightColStats = rightStats.findColumnStatistics(slot); + StatisticRange leftRange = StatisticRange.from(leftColStats, slot.getDataType()); + StatisticRange rightRange = StatisticRange.from(rightColStats, slot.getDataType()); + StatisticRange union = leftRange.union(rightRange); + ColumnStatisticBuilder colBuilder = new ColumnStatisticBuilder( + context.statistics.findColumnStatistics(slot)); + colBuilder.setMinValue(union.getLow()).setMinExpr(union.getLowExpr()) + .setMaxValue(union.getHigh()).setMaxExpr(union.getHighExpr()) + .setNdv(union.getDistinctValues()); + orStats.addColumnStats(slot, colBuilder.build()); } } return orStats; } + // should not come here + Preconditions.checkArgument(false, + "unsupported compound operator: %s in %s", + predicate.getClass().getName(), predicate.toSql()); return context.statistics; } @@ -172,25 +179,27 @@ public Statistics visitComparisonPredicate(ComparisonPredicate cp, EstimationCon } private Statistics updateLessThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft, - double val, EstimationContext context, boolean contains) { + ColumnStatistic statsForRight, EstimationContext context, boolean contains) { if (statsForLeft.hasHistogram()) { - return estimateLessThanLiteralWithHistogram(leftExpr, statsForLeft, val, context, contains); + return estimateLessThanLiteralWithHistogram(leftExpr, statsForLeft, + statsForRight.maxValue, context, contains); } - //rightRange.distinctValues should not be used - StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, val, statsForLeft.ndv, - leftExpr.getDataType()); + StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, statsForLeft.minExpr, + statsForRight.maxValue, statsForRight.maxExpr, + statsForLeft.ndv, 
leftExpr.getDataType()); return estimateBinaryComparisonFilter(leftExpr, statsForLeft, rightRange, context); } private Statistics updateGreaterThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft, - double val, EstimationContext context, boolean contains) { + ColumnStatistic statsForRight, EstimationContext context, boolean contains) { if (statsForLeft.hasHistogram()) { - return estimateGreaterThanLiteralWithHistogram(leftExpr, statsForLeft, val, context, contains); + return estimateGreaterThanLiteralWithHistogram(leftExpr, statsForLeft, + statsForRight.minValue, context, contains); } - //rightRange.distinctValues should not be used - StatisticRange rightRange = new StatisticRange(val, statsForLeft.maxValue, + StatisticRange rightRange = new StatisticRange(statsForRight.minValue, statsForRight.minExpr, + statsForLeft.maxValue, statsForLeft.maxExpr, statsForLeft.ndv, leftExpr.getDataType()); return estimateBinaryComparisonFilter(leftExpr, statsForLeft, rightRange, context); } @@ -204,12 +213,12 @@ private Statistics calculateWhenLiteralRight(ComparisonPredicate cp, if (cp instanceof EqualTo || cp instanceof NullSafeEqual) { return estimateEqualTo(cp, statsForLeft, statsForRight, context); } else { - double val = statsForRight.maxValue; if (cp instanceof LessThan || cp instanceof LessThanEqual) { - return updateLessThanLiteral(cp.left(), statsForLeft, val, context, cp instanceof LessThanEqual); + return updateLessThanLiteral(cp.left(), statsForLeft, statsForRight, + context, cp instanceof LessThanEqual); } else if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) { - return updateGreaterThanLiteral(cp.left(), statsForLeft, val, context, + return updateGreaterThanLiteral(cp.left(), statsForLeft, statsForRight, context, cp instanceof GreaterThanEqual); } else { throw new RuntimeException(String.format("Unexpected expression : %s", cp.toSql())); @@ -234,19 +243,10 @@ private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic stats 
Statistics equalStats = context.statistics.withSel(selectivity); Expression left = cp.left(); - if (left instanceof Cast) { - left = ((Cast) left).child(); - } - if (left instanceof SlotReference) { - Slot leftSlot = (SlotReference) left; - //update min/max of cp.left - ColumnStatistic columnStats = equalStats.findColumnStatistics(leftSlot); - ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(columnStats); - colStatsBuilder.setMaxValue(val); - colStatsBuilder.setMinValue(val); - colStatsBuilder.setNdv(1); - colStatsBuilder.setNumNulls(0); - equalStats.addColumnStats(leftSlot, colStatsBuilder.build()); + equalStats.addColumnStats(left, statsForRight); + context.addKeyIfSlot(left); + if (!(left instanceof SlotReference)) { + left.accept(new ColumnStatsAdjustVisitor(), equalStats); } return equalStats; } @@ -275,8 +275,12 @@ public Statistics visitInPredicate(InPredicate inPredicate, EstimationContext co return context.statistics.withSel(DEFAULT_IN_COEFFICIENT); } List options = inPredicate.getOptions(); - double maxOption = 0; - double minOption = Double.MAX_VALUE; + // init minOption and maxOption by compareExpr.max and compareExpr.min respectively, + // and then adjust min/max by options + double minOptionValue = compareExprStats.maxValue; + double maxOptionValue = compareExprStats.minValue; + LiteralExpr minOptionLiteral = compareExprStats.maxExpr; + LiteralExpr maxOptionLiteral = compareExprStats.minExpr; /* suppose A.(min, max) = (0, 10), A.ndv=10 A in ( 1, 2, 5, 100): validInOptCount = 3, that is (1, 2, 5) @@ -292,86 +296,199 @@ A not in (1, 2, 3, 100): A.(min, max) not changed A.selectivity = 7/10 */ - double validInOptCount = 0; + int validInOptCount = 0; double selectivity = 1.0; ColumnStatisticBuilder compareExprStatsBuilder = new ColumnStatisticBuilder(compareExprStats); - + int nonLiteralOptionCount = 0; for (Expression option : options) { ColumnStatistic optionStats = ExpressionEstimation.estimate(option, context.statistics); - double 
validOptionNdv = compareExprStats.ndvIntersection(optionStats); - if (validOptionNdv > 0.0) { - validInOptCount += validOptionNdv; - maxOption = Math.max(optionStats.maxValue, maxOption); - minOption = Math.min(optionStats.minValue, minOption); + if (option instanceof Literal) { + // remove the options which is out of compareExpr.range + if (compareExprStats.minValue <= optionStats.maxValue + && optionStats.maxValue <= compareExprStats.maxValue) { + validInOptCount++; + LiteralExpr optionLiteralExpr = ((Literal) option).toLegacyLiteral(); + if (maxOptionLiteral == null || optionLiteralExpr.compareTo(maxOptionLiteral) >= 0) { + maxOptionLiteral = optionLiteralExpr; + maxOptionValue = optionStats.maxValue; + } + + if (minOptionLiteral == null || optionLiteralExpr.compareTo(minOptionLiteral) <= 0) { + minOptionLiteral = optionLiteralExpr; + minOptionValue = optionStats.minValue; + } + } + } else { + nonLiteralOptionCount++; + } + } + if (nonLiteralOptionCount > 0) { + // A in (x+1, ...) + // "x+1" is not literal, and if const-fold can not handle it, it blocks estimation of min/max value. + // and hence, we do not adjust compareExpr.stats.range. 
+ int newNdv = nonLiteralOptionCount + validInOptCount; + if (newNdv < compareExprStats.ndv) { + compareExprStatsBuilder.setNdv(newNdv); + selectivity = StatsMathUtil.divide(newNdv, compareExprStats.ndv); + } else { + selectivity = 1.0; + } + } else { + maxOptionValue = Math.min(maxOptionValue, compareExprStats.maxValue); + minOptionValue = Math.max(minOptionValue, compareExprStats.minValue); + compareExprStatsBuilder.setMaxValue(maxOptionValue); + compareExprStatsBuilder.setMaxExpr(maxOptionLiteral); + compareExprStatsBuilder.setMinValue(minOptionValue); + compareExprStatsBuilder.setMinExpr(minOptionLiteral); + if (validInOptCount < compareExprStats.ndv) { + compareExprStatsBuilder.setNdv(validInOptCount); + selectivity = StatsMathUtil.divide(validInOptCount, compareExprStats.ndv); + } else { + selectivity = 1.0; } } - maxOption = Math.min(maxOption, compareExprStats.maxValue); - minOption = Math.max(minOption, compareExprStats.minValue); - compareExprStatsBuilder.setMaxValue(maxOption); - compareExprStatsBuilder.setMinValue(minOption); - - selectivity = StatsMathUtil.minNonNaN(1.0, validInOptCount / compareExprStats.ndv); - compareExprStatsBuilder.setNdv(validInOptCount); Statistics estimated = new Statistics(context.statistics); estimated = estimated.withSel(selectivity); - if (compareExpr instanceof SlotReference) { - estimated.addColumnStats(compareExpr, - compareExprStatsBuilder.build()); - } + estimated.addColumnStats(compareExpr, + compareExprStatsBuilder.build()); + context.addKeyIfSlot(compareExpr); return estimated; } + // Right Now, we just assume the selectivity is 1 when stats is Unknown + private Statistics handleUnknownCase(EstimationContext context) { + return context.statistics; + } + @Override public Statistics visitNot(Not not, EstimationContext context) { - Statistics childStats = new FilterEstimation().estimate(not.child(), context.statistics); + if (context.statistics.isInputSlotsUnknown(not.getInputSlots())) { + return 
handleUnknownCase(context); + } + Expression child = not.child(); + Statistics childStats = child.accept(this, context); //if estimated rowCount is 0, adjust to 1 to make upper join reorder reasonable. double rowCount = Math.max(context.statistics.getRowCount() - childStats.getRowCount(), 1); StatisticsBuilder statisticsBuilder = new StatisticsBuilder(context.statistics).setRowCount(rowCount); - for (Entry entry : context.statistics.columnStatistics().entrySet()) { - Expression expr = entry.getKey(); - ColumnStatistic originColStats = entry.getValue(); - ColumnStatistic childColStats = childStats.findColumnStatistics(expr); - double originNonNullCount = Math.max(originColStats.count - originColStats.numNulls, 0); - double childNonNullCount = Math.max(childColStats.count - childColStats.numNulls, 0); - double supersetValuesPerDistinctValue = StatsMathUtil.divide(originNonNullCount, originColStats.ndv); - double subsetValuesPerDistinctValue = StatsMathUtil.divide(childNonNullCount, childColStats.ndv); - double ndv; - if (supersetValuesPerDistinctValue <= subsetValuesPerDistinctValue) { - ndv = Math.max(originColStats.ndv - childColStats.ndv, 0); - } else { - ndv = originColStats.ndv; + // update key col stats + for (Slot slot : not.child().getInputSlots()) { + ColumnStatistic originColStats = context.statistics.findColumnStatistics(slot); + ColumnStatistic childColStats = childStats.findColumnStatistics(slot); + if (context.isKeySlot(slot)) { + ColumnStatisticBuilder colBuilder = new ColumnStatisticBuilder(childColStats); + // update column stats for + // 1. not (A=B) + // 2. not A in (...) + // 3. not A is null + // 4. 
not A like XXX + colBuilder.setNumNulls(0); + Preconditions.checkArgument( + child instanceof EqualTo + || child instanceof InPredicate + || child instanceof IsNull + || child instanceof Like, + "Not-predicate meet unexpected child: %s", child.toSql()); + if (child instanceof Like) { + rowCount = context.statistics.getRowCount() - childStats.getRowCount(); + colBuilder.setNdv(originColStats.ndv - childColStats.ndv); + } else if (child instanceof InPredicate) { + colBuilder.setNdv(originColStats.ndv - childColStats.ndv); + colBuilder.setMinValue(originColStats.minValue) + .setMinExpr(originColStats.minExpr) + .setMaxValue(originColStats.maxValue) + .setMaxExpr(originColStats.maxExpr); + } else if (child instanceof IsNull) { + colBuilder.setNdv(originColStats.ndv); + colBuilder.setMinValue(originColStats.minValue) + .setMinExpr(originColStats.minExpr) + .setMaxValue(originColStats.maxValue) + .setMaxExpr(originColStats.maxExpr); + } else if (child instanceof EqualTo) { + colBuilder.setNdv(originColStats.ndv - childColStats.ndv); + colBuilder.setMinValue(originColStats.minValue) + .setMinExpr(originColStats.minExpr) + .setMaxValue(originColStats.maxValue) + .setMaxExpr(originColStats.maxExpr); + } + statisticsBuilder.putColumnStatistics(slot, colBuilder.build()); } - double nullCount = Math.max(originColStats.numNulls - childColStats.numNulls, 0); - ColumnStatistic columnStatistic = new ColumnStatisticBuilder(originColStats) - .setNdv(ndv) - .setNumNulls(nullCount) - .build(); - statisticsBuilder.putColumnStatistics(expr, columnStatistic); } + return statisticsBuilder.build(); } + @Override + public Statistics visitIsNull(IsNull isNull, EstimationContext context) { + ColumnStatistic childStats = ExpressionEstimation.estimate(isNull.child(), context.statistics); + if (childStats.isUnKnown()) { + return new StatisticsBuilder(context.statistics).build(); + } + double outputRowCount = childStats.numNulls; + ColumnStatisticBuilder colBuilder = new 
ColumnStatisticBuilder(childStats); + colBuilder.setCount(outputRowCount).setNumNulls(outputRowCount) + .setMaxValue(Double.POSITIVE_INFINITY) + .setMinValue(Double.NEGATIVE_INFINITY) + .setNdv(0); + StatisticsBuilder builder = new StatisticsBuilder(context.statistics); + builder.putColumnStatistics(isNull.child(), colBuilder.build()); + context.addKeyIfSlot(isNull.child()); + return builder.build(); + } + static class EstimationContext { private final Statistics statistics; + private final Set keyColumns = Sets.newHashSet(); + public EstimationContext(Statistics statistics) { this.statistics = statistics; } + + public void addKeyIfSlot(Expression expr) { + if (expr instanceof Slot) { + keyColumns.add((Slot) expr); + } + } + + public boolean isKeySlot(Expression expr) { + if (expr instanceof Slot) { + return keyColumns.contains((Slot) expr); + } + return false; + } } private Statistics estimateBinaryComparisonFilter(Expression leftExpr, ColumnStatistic leftStats, StatisticRange rightRange, EstimationContext context) { StatisticRange leftRange = - new StatisticRange(leftStats.minValue, leftStats.maxValue, leftStats.ndv, leftExpr.getDataType()); + new StatisticRange(leftStats.minValue, leftStats.minExpr, leftStats.maxValue, leftStats.maxExpr, + leftStats.ndv, leftExpr.getDataType()); StatisticRange intersectRange = leftRange.cover(rightRange); - ColumnStatisticBuilder leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats) - .setMinValue(intersectRange.getLow()) - .setMaxValue(intersectRange.getHigh()) - .setNdv(intersectRange.getDistinctValues()); - double sel = leftRange.overlapPercentWith(rightRange); - Statistics updatedStatistics = context.statistics.withSel(sel); - leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount()); + + ColumnStatisticBuilder leftColumnStatisticBuilder; + Statistics updatedStatistics; + if (intersectRange.isEmpty()) { + updatedStatistics = context.statistics.withRowCount(0); + leftColumnStatisticBuilder = new 
ColumnStatisticBuilder(leftStats) + .setMinValue(Double.NEGATIVE_INFINITY) + .setMinExpr(null) + .setMaxValue(Double.POSITIVE_INFINITY) + .setMaxExpr(null) + .setNdv(0) + .setCount(0); + } else { + leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats) + .setMinValue(intersectRange.getLow()) + .setMinExpr(intersectRange.getLowExpr()) + .setMaxValue(intersectRange.getHigh()) + .setMaxExpr(intersectRange.getHighExpr()) + .setNdv(intersectRange.getDistinctValues()); + double sel = leftRange.overlapPercentWith(rightRange); + updatedStatistics = context.statistics.withSel(sel); + leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount()); + } updatedStatistics.addColumnStats(leftExpr, leftColumnStatisticBuilder.build()); + context.addKeyIfSlot(leftExpr); leftExpr.accept(new ColumnStatsAdjustVisitor(), updatedStatistics); return updatedStatistics; } @@ -381,36 +498,17 @@ private Statistics estimateColumnEqualToColumn(Expression leftExpr, ColumnStatis StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType()); StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType()); StatisticRange leftIntersectRight = leftRange.intersect(rightRange); - StatisticRange rightIntersectLeft = rightRange.intersect(leftIntersectRight); - ColumnStatisticBuilder leftBuilder = new ColumnStatisticBuilder(leftStats); - leftBuilder.setNdv(leftIntersectRight.getDistinctValues()); - leftBuilder.setMinValue(leftIntersectRight.getLow()); - leftBuilder.setMaxValue(leftIntersectRight.getHigh()); - ColumnStatisticBuilder rightBuilder = new ColumnStatisticBuilder(rightStats); - rightBuilder.setNdv(rightIntersectLeft.getDistinctValues()); - rightBuilder.setMinValue(rightIntersectLeft.getLow()); - rightBuilder.setMaxValue(rightIntersectLeft.getDistinctValues()); - double sel; - double reduceRatio = 0.25; - double bothSideReducedRatio = 0.9; - if (!leftStats.rangeChanged() && !rightStats.rangeChanged() - && leftStats.ndv < 
leftStats.getOriginalNdv() * bothSideReducedRatio - && rightStats.ndv < rightStats.getOriginalNdv() * bothSideReducedRatio) { - double sel1; - if (leftStats.ndv > rightStats.ndv) { - sel1 = 1 / StatsMathUtil.nonZeroDivisor(leftStats.ndv); - } else { - sel1 = 1 / StatsMathUtil.nonZeroDivisor(rightStats.ndv); - } - double sel2 = Math.min(rightStats.ndv / rightStats.getOriginalNdv(), - leftStats.ndv / leftStats.getOriginalNdv()); - sel = sel1 * Math.pow(sel2, reduceRatio); - } else { - sel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, rightStats.ndv)); - } + StatisticRange intersect = rightRange.intersect(leftIntersectRight); + ColumnStatisticBuilder intersectBuilder = new ColumnStatisticBuilder(leftStats); + intersectBuilder.setNdv(intersect.getDistinctValues()); + intersectBuilder.setMinValue(intersect.getLow()); + intersectBuilder.setMaxValue(intersect.getHigh()); + double sel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, rightStats.ndv)); Statistics updatedStatistics = context.statistics.withSel(sel); - updatedStatistics.addColumnStats(leftExpr, leftBuilder.build()); - updatedStatistics.addColumnStats(rightExpr, rightBuilder.build()); + updatedStatistics.addColumnStats(leftExpr, intersectBuilder.build()); + updatedStatistics.addColumnStats(rightExpr, intersectBuilder.build()); + context.addKeyIfSlot(leftExpr); + context.addKeyIfSlot(rightExpr); return updatedStatistics; } @@ -426,6 +524,8 @@ private Statistics estimateColumnLessThanColumn(Expression leftExpr, ColumnStati context.statistics.getRowCount() - rightStats.numNulls)); statistics.addColumnStats(leftExpr, new ColumnStatisticBuilder(leftStats).setNumNulls(0.0).build()); statistics.addColumnStats(rightExpr, new ColumnStatisticBuilder(rightStats).setNumNulls(0.0).build()); + context.addKeyIfSlot(leftExpr); + context.addKeyIfSlot(rightExpr); return statistics; } double leftOverlapPercent = leftRange.overlapPercentWith(rightRange); @@ -433,8 +533,8 @@ private Statistics 
estimateColumnLessThanColumn(Expression leftExpr, ColumnStati if (leftOverlapPercent == 0) { return context.statistics.withRowCount(0.0); } - StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue, - rightStats.minValue, Double.NaN, leftExpr.getDataType()); + StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue, leftStats.minExpr, + rightStats.minValue, rightStats.minExpr, Double.NaN, leftExpr.getDataType()); double leftAlwaysLessThanRightPercent = 0; if (leftRange.getLow() < rightRange.getLow()) { leftAlwaysLessThanRightPercent = leftRange.overlapPercentWith(leftAlwaysLessThanRightRange); @@ -448,8 +548,10 @@ private Statistics estimateColumnLessThanColumn(Expression leftExpr, ColumnStati double rightOverlappingRangeFraction = rightRange.overlapPercentWith(leftRange); double rightAlwaysGreaterRangeFraction = 0; if (leftRange.getHigh() < rightRange.getHigh()) { - rightAlwaysGreaterRangeFraction = rightRange.overlapPercentWith(new StatisticRange(leftRange.getHigh(), - rightRange.getHigh(), Double.NaN, rightExpr.getDataType())); + rightAlwaysGreaterRangeFraction = rightRange.overlapPercentWith(new StatisticRange( + leftRange.getHigh(), leftRange.getHighExpr(), + rightRange.getHigh(), rightRange.getHighExpr(), + Double.NaN, rightExpr.getDataType())); } ColumnStatistic rightColumnStatistic = new ColumnStatisticBuilder(rightStats) .setMinValue(Math.max(leftRange.getLow(), rightRange.getLow())) @@ -460,6 +562,8 @@ private Statistics estimateColumnLessThanColumn(Expression leftExpr, ColumnStati double sel = leftAlwaysLessThanRightPercent + leftOverlapPercent * rightOverlappingRangeFraction * DEFAULT_INEQUALITY_COEFFICIENT + leftOverlapPercent * rightAlwaysGreaterRangeFraction; + context.addKeyIfSlot(leftExpr); + context.addKeyIfSlot(rightExpr); return context.statistics.withSel(sel) .addColumnStats(leftExpr, leftColumnStatistic) .addColumnStats(rightExpr, rightColumnStatistic); @@ -493,6 +597,7 @@ private 
Statistics estimateLessThanLiteralWithHistogram(Expression leftExpr, Col .setMaxValue(numVal) .setHistogram(new HistogramBuilder(leftHist).setBuckets(updatedBucketList).build()) .build(); + context.addKeyIfSlot(leftExpr); return context.statistics.withSel(sel).addColumnStats(leftExpr, columnStatistic); } } @@ -520,7 +625,7 @@ private Statistics estimateGreaterThanLiteralWithHistogram(Expression leftExpr, double overlapCountInBucket = overlapPercentInBucket * bucket.count; double sel = StatsMathUtil.minNonNaN(1, (leftHist.size() - bucket.preSum - (bucket.count - overlapCountInBucket)) - / context.statistics.getRowCount()); + / context.statistics.getRowCount()); List updatedBucketList = new ArrayList<>(); updatedBucketList.add(new Bucket(numVal, bucket.upper, overlapPercentInBucket * bucket.count, 0, overlapPercentInBucket * bucket.ndv)); @@ -529,6 +634,7 @@ private Statistics estimateGreaterThanLiteralWithHistogram(Expression leftExpr, .setMaxValue(numVal) .setHistogram(new HistogramBuilder(leftHist).setBuckets(updatedBucketList).build()) .build(); + context.addKeyIfSlot(leftExpr); return context.statistics.withSel(sel).addColumnStats(leftExpr, columnStatistic); } } @@ -556,11 +662,24 @@ private Statistics estimateEqualToWithHistogram(Expression leftExpr, ColumnStati .setMaxValue(numVal) .setMinValue(numVal) .build(); + context.addKeyIfSlot(leftExpr); return context.statistics.withSel(sel).addColumnStats(leftExpr, columnStatistic); } @Override public Statistics visitLike(Like like, EstimationContext context) { - return context.statistics.withSel(DEFAULT_LIKE_COMPARISON_SELECTIVITY); + StatisticsBuilder statsBuilder = new StatisticsBuilder(context.statistics); + statsBuilder.setRowCount(context.statistics.getRowCount() * DEFAULT_LIKE_COMPARISON_SELECTIVITY); + if (like.left() instanceof Slot) { + ColumnStatistic origin = context.statistics.findColumnStatistics(like.left()); + Preconditions.checkArgument(origin != null, + "col stats not found. 
slot=%s in %s", + like.left().toSql(), like.toSql()); + ColumnStatisticBuilder colBuilder = new ColumnStatisticBuilder(origin); + colBuilder.setNdv(origin.ndv * DEFAULT_LIKE_COMPARISON_SELECTIVITY).setNumNulls(0); + statsBuilder.putColumnStatistics(like.left(), colBuilder.build()); + context.addKeyIfSlot(like.left()); + } + return statsBuilder.build(); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java index a40e409fae40c0..ef4575e3308d7a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java @@ -72,16 +72,7 @@ private static boolean hashJoinConditionContainsUnknownColumnStats(Statistics le return false; } - private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rightStats, Join join) { - if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { - double rowCount = Math.max(leftStats.getRowCount(), rightStats.getRowCount()); - rowCount = Math.max(1, rowCount); - return new StatisticsBuilder() - .setRowCount(rowCount) - .putColumnStatistics(leftStats.columnStatistics()) - .putColumnStatistics(rightStats.columnStatistics()) - .build(); - } + private static Statistics estimateHashJoin(Statistics leftStats, Statistics rightStats, Join join) { /* * When we estimate filter A=B, * if any side of equation, A or B, is almost unique, the confidence level of estimation is high. 
@@ -129,7 +120,7 @@ private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rig .putColumnStatistics(rightStats.columnStatistics()) .build(); - double outputRowCount = 1; + double outputRowCount; if (!trustableConditions.isEmpty()) { List joinConditionSels = trustableConditions.stream() .map(expression -> estimateJoinConditionSel(crossJoinStats, expression)) @@ -138,20 +129,47 @@ private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rig double sel = 1.0; double denominator = 1.0; - for (int i = 0; i < joinConditionSels.size(); i++) { - sel *= Math.pow(joinConditionSels.get(i), 1 / denominator); + for (Double joinConditionSel : joinConditionSels) { + sel *= Math.pow(joinConditionSel, 1 / denominator); denominator *= 2; } outputRowCount = Math.max(1, crossJoinStats.getRowCount() * sel); outputRowCount = outputRowCount * Math.pow(0.9, unTrustableCondition.size()); - innerJoinStats = crossJoinStats.updateRowCountOnly(outputRowCount); } else { outputRowCount = Math.max(leftStats.getRowCount(), rightStats.getRowCount()); Optional ratio = unTrustEqualRatio.stream().min(Double::compareTo); if (ratio.isPresent()) { outputRowCount = Math.max(1, outputRowCount * ratio.get()); } - innerJoinStats = crossJoinStats.updateRowCountOnly(outputRowCount); + } + innerJoinStats = crossJoinStats.withRowCountAndEnforceValid(outputRowCount); + return innerJoinStats; + } + + private static Statistics estimateNestLoopJoin(Statistics leftStats, Statistics rightStats, Join join) { + return new StatisticsBuilder() + .setRowCount(Math.max(1, leftStats.getRowCount() * rightStats.getRowCount())) + .putColumnStatistics(leftStats.columnStatistics()) + .putColumnStatistics(rightStats.columnStatistics()) + .build(); + } + + private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rightStats, Join join) { + if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { + double rowCount = Math.max(leftStats.getRowCount(), 
rightStats.getRowCount()); + rowCount = Math.max(1, rowCount); + return new StatisticsBuilder() + .setRowCount(rowCount) + .putColumnStatistics(leftStats.columnStatistics()) + .putColumnStatistics(rightStats.columnStatistics()) + .build(); + } + + Statistics innerJoinStats; + if (join.getHashJoinConjuncts().isEmpty()) { + innerJoinStats = estimateNestLoopJoin(leftStats, rightStats, join); + } else { + innerJoinStats = estimateHashJoin(leftStats, rightStats, join); } if (!join.getOtherJoinConjuncts().isEmpty()) { @@ -162,9 +180,6 @@ private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rig innerJoinStats = new StatisticsBuilder(innerJoinStats).setRowCount(1).build(); } } - - innerJoinStats.setWidth(leftStats.getWidth() + rightStats.getWidth()); - innerJoinStats.setPenalty(0); return innerJoinStats; } @@ -242,10 +257,9 @@ private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics ri double baseRowCount = join.getJoinType().isLeftSemiOrAntiJoin() ? 
leftStats.getRowCount() : rightStats.getRowCount(); rowCount = Math.min(innerJoinStats.getRowCount(), baseRowCount); - return innerJoinStats.withRowCount(rowCount); + return innerJoinStats.withRowCountAndEnforceValid(rowCount); } else { StatisticsBuilder builder; - double originalRowCount = leftStats.getRowCount(); if (join.getJoinType().isLeftSemiOrAntiJoin()) { builder = new StatisticsBuilder(leftStats); builder.setRowCount(rowCount); @@ -253,10 +267,9 @@ private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics ri //right semi or anti builder = new StatisticsBuilder(rightStats); builder.setRowCount(rowCount); - originalRowCount = rightStats.getRowCount(); } Statistics outputStats = builder.build(); - outputStats.fix(rowCount, originalRowCount); + outputStats.enforceValid(); return outputStats; } } @@ -276,15 +289,15 @@ public static Statistics estimate(Statistics leftStats, Statistics rightStats, J Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); double rowCount = Math.max(leftStats.getRowCount(), innerJoinStats.getRowCount()); rowCount = Math.max(leftStats.getRowCount(), rowCount); - return innerJoinStats.withRowCount(rowCount); + return innerJoinStats.withRowCountAndEnforceValid(rowCount); } else if (joinType == JoinType.RIGHT_OUTER_JOIN) { Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); double rowCount = Math.max(rightStats.getRowCount(), innerJoinStats.getRowCount()); rowCount = Math.max(rowCount, rightStats.getRowCount()); - return innerJoinStats.withRowCount(rowCount); + return innerJoinStats.withRowCountAndEnforceValid(rowCount); } else if (joinType == JoinType.FULL_OUTER_JOIN) { Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); - return innerJoinStats.withRowCount(leftStats.getRowCount() + return innerJoinStats.withRowCountAndEnforceValid(leftStats.getRowCount() + rightStats.getRowCount() + innerJoinStats.getRowCount()); } else if (joinType == 
JoinType.CROSS_JOIN) { return new StatisticsBuilder() diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index 6910f5b4af3b5c..48ea4c9800daef 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -17,10 +17,7 @@ package org.apache.doris.nereids.stats; -import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.OlapTable; -import org.apache.doris.catalog.SchemaTable; import org.apache.doris.catalog.TableIf; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; @@ -620,7 +617,7 @@ private Statistics computeCatalogRelation(CatalogRelation catalogRelation) { double rowCount = catalogRelation.getTable().estimatedRowCount(); for (SlotReference slotReference : slotSet) { String colName = slotReference.getName(); - boolean shouldIgnoreThisCol = shouldIgnoreCol(table, slotReference.getColumn().get()); + boolean shouldIgnoreThisCol = StatisticConstants.shouldIgnoreCol(table, slotReference.getColumn().get()); if (colName == null) { throw new RuntimeException(String.format("Invalid slot: %s", slotReference.getExprId())); @@ -658,16 +655,20 @@ private Statistics computeTopN(TopN topN) { } private Statistics computePartitionTopN(PartitionTopN partitionTopN) { - Statistics stats = groupExpression.childStatistics(0); - double rowCount = stats.getRowCount(); + Statistics childStats = groupExpression.childStatistics(0); + double rowCount = childStats.getRowCount(); List partitionKeys = partitionTopN.getPartitionKeys(); if (!partitionTopN.hasGlobalLimit() && !partitionKeys.isEmpty()) { // If there is no global limit. 
So result for the cardinality estimation is: // NDV(partition key) * partitionLimit - Map childSlotToColumnStats = stats.columnStatistics(); List partitionByKeyStats = partitionKeys.stream() - .filter(childSlotToColumnStats::containsKey) - .map(childSlotToColumnStats::get) + .map(partitionKey -> { + ColumnStatistic partitionKeyStats = childStats.findColumnStatistics(partitionKey); + if (partitionKeyStats == null) { + partitionKeyStats = new ExpressionEstimation().visit(partitionKey, childStats); + } + return partitionKeyStats; + }) .filter(s -> !s.isUnKnown) .collect(Collectors.toList()); if (partitionByKeyStats.isEmpty()) { @@ -675,7 +676,7 @@ private Statistics computePartitionTopN(PartitionTopN partitionTopN) { rowCount = rowCount * DEFAULT_COLUMN_NDV_RATIO; } else { rowCount = Math.min(rowCount, partitionByKeyStats.stream().map(s -> s.ndv) - .max(Double::compare).get()); + .max(Double::compare).get() * partitionTopN.getPartitionLimit()); } } else { rowCount = Math.min(rowCount, partitionTopN.getPartitionLimit()); @@ -683,7 +684,7 @@ private Statistics computePartitionTopN(PartitionTopN partitionTopN) { // TODO: for the filter push down window situation, we will prune the row count twice // because we keep the pushed down filter. And it will be calculated twice, one of them in 'PartitionTopN' // and the other is in 'Filter'. It's hard to dismiss. 
- return stats.updateRowCountOnly(rowCount); + return childStats.withRowCountAndEnforceValid(rowCount); } private Statistics computeLimit(Limit limit) { @@ -752,9 +753,7 @@ private Statistics computeAggregate(Aggregate aggregate) { builder.setDataSize(rowCount * outputExpression.getDataType().width()); slotToColumnStats.put(outputExpression.toSlot(), columnStat); } - return new Statistics(rowCount, slotToColumnStats, childStats.getWidth(), - childStats.getPenalty() + childStats.getRowCount()); - // TODO: Update ColumnStats properly, add new mapping from output slot to ColumnStats + return new Statistics(rowCount, slotToColumnStats); } private Statistics computeRepeat(Repeat repeat) { @@ -772,8 +771,7 @@ private Statistics computeRepeat(Repeat repeat) { .setDataSize(stats.dataSize < 0 ? stats.dataSize : stats.dataSize * groupingSetNum); return Pair.of(kv.getKey(), columnStatisticBuilder.build()); }).collect(Collectors.toMap(Pair::key, Pair::value)); - return new Statistics(rowCount < 0 ? rowCount : rowCount * groupingSetNum, columnStatisticMap, - childStats.getWidth(), childStats.getPenalty()); + return new Statistics(rowCount < 0 ? 
rowCount : rowCount * groupingSetNum, columnStatisticMap); } private Statistics computeProject(Project project) { @@ -783,7 +781,7 @@ private Statistics computeProject(Project project) { ColumnStatistic columnStatistic = ExpressionEstimation.estimate(projection, childStats); return new SimpleEntry<>(projection.toSlot(), columnStatistic); }).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (item1, item2) -> item1)); - return new Statistics(childStats.getRowCount(), columnsStats, childStats.getWidth(), childStats.getPenalty()); + return new Statistics(childStats.getRowCount(), columnsStats); } private Statistics computeOneRowRelation(List projects) { @@ -1067,16 +1065,4 @@ public Statistics visitPhysicalCTEAnchor( return groupExpression.childStatistics(1); } - private boolean shouldIgnoreCol(TableIf tableIf, Column c) { - if (tableIf instanceof SchemaTable) { - return true; - } - if (tableIf instanceof OlapTable) { - OlapTable olapTable = (OlapTable) tableIf; - if (StatisticConstants.STATISTICS_DB_BLACK_LIST.contains(olapTable.getQualifiedDbName())) { - return true; - } - } - return !c.isVisible(); - } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Properties.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Properties.java new file mode 100644 index 00000000000000..db0c78c1f78f57 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Properties.java @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions; + +import org.apache.doris.nereids.exceptions.UnboundException; +import org.apache.doris.nereids.trees.expressions.shape.LeafExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.DataType; +import org.apache.doris.nereids.types.MapType; + +import com.google.common.collect.ImmutableList; + +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * Properties + */ +public class Properties extends Expression implements LeafExpression { + + private final Map keyValues; + + public Properties(Map properties) { + super(ImmutableList.of()); + this.keyValues = Objects.requireNonNull(properties, "properties can not be null"); + } + + public Map getMap() { + return keyValues; + } + + @Override + public boolean nullable() { + return false; + } + + @Override + public DataType getDataType() throws UnboundException { + return MapType.SYSTEM_DEFAULT; + } + + @Override + public String toSql() { + return getMap() + .entrySet() + .stream() + .map(kv -> "'" + kv.getKey() + "' = '" + kv.getValue() + "'") + .collect(Collectors.joining(", ")); + } + + @Override + public String toString() { + return "Properties(" + toSql() + ")"; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + if (!super.equals(o)) { + return false; + } + Properties that = (Properties) o; + return Objects.equals(keyValues, 
that.keyValues); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), keyValues); + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitProperties(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Hdfs.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Hdfs.java index d82878c4a48f2a..c66684cd22c657 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Hdfs.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Hdfs.java @@ -19,7 +19,7 @@ import org.apache.doris.catalog.FunctionSignature; import org.apache.doris.nereids.exceptions.AnalysisException; -import org.apache.doris.nereids.trees.expressions.TVFProperties; +import org.apache.doris.nereids.trees.expressions.Properties; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.nereids.types.coercion.AnyDataType; import org.apache.doris.tablefunction.HdfsTableValuedFunction; @@ -30,13 +30,13 @@ /** hdfs */ public class Hdfs extends TableValuedFunction { - public Hdfs(TVFProperties properties) { + public Hdfs(Properties properties) { super("hdfs", properties); } @Override public FunctionSignature customSignature() { - return FunctionSignature.of(AnyDataType.INSTANCE, (List) getArgumentsTypes()); + return FunctionSignature.of(AnyDataType.INSTANCE_WITHOUT_INDEX, (List) getArgumentsTypes()); } @Override @@ -46,7 +46,7 @@ protected TableValuedFunctionIf toCatalogFunction() { return new HdfsTableValuedFunction(arguments); } catch (Throwable t) { throw new AnalysisException("Can not build HdfsTableValuedFunction by " - + this + ": " + t.getMessage(), t); + + this + ": " + t.getMessage(), t); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Local.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Local.java index cd4169c1c9592a..d45a4c939433f5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Local.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Local.java @@ -19,7 +19,7 @@ import org.apache.doris.catalog.FunctionSignature; import org.apache.doris.nereids.exceptions.AnalysisException; -import org.apache.doris.nereids.trees.expressions.TVFProperties; +import org.apache.doris.nereids.trees.expressions.Properties; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.nereids.types.coercion.AnyDataType; import org.apache.doris.tablefunction.LocalTableValuedFunction; @@ -31,13 +31,13 @@ * local */ public class Local extends TableValuedFunction { - public Local(TVFProperties properties) { + public Local(Properties properties) { super("local", properties); } @Override public FunctionSignature customSignature() { - return FunctionSignature.of(AnyDataType.INSTANCE, getArgumentsTypes()); + return FunctionSignature.of(AnyDataType.INSTANCE_WITHOUT_INDEX, getArgumentsTypes()); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Numbers.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Numbers.java index 0043ab9c1f7835..c5febcf9749f44 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Numbers.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Numbers.java @@ -24,8 +24,8 @@ import org.apache.doris.nereids.exceptions.AnalysisException; import org.apache.doris.nereids.properties.PhysicalProperties; import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.Properties; import 
org.apache.doris.nereids.trees.expressions.Slot; -import org.apache.doris.nereids.trees.expressions.TVFProperties; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.nereids.types.BigIntType; import org.apache.doris.statistics.ColumnStatistic; @@ -42,7 +42,7 @@ /** Numbers */ public class Numbers extends TableValuedFunction { - public Numbers(TVFProperties properties) { + public Numbers(Properties properties) { super("numbers", properties); } @@ -72,7 +72,7 @@ public Statistics computeStats(List slots) { Map columnToStatistics = Maps.newHashMap(); ColumnStatistic columnStat = new ColumnStatisticBuilder() .setCount(rowNum).setNdv(rowNum).setAvgSizeByte(8).setNumNulls(0).setDataSize(8).setMinValue(0) - .setMaxValue(rowNum - 1).setSelectivity(1.0 / rowNum) + .setMaxValue(rowNum - 1) .setMinExpr(new IntLiteral(0, Type.BIGINT)) .setMaxExpr(new IntLiteral(rowNum - 1, Type.BIGINT)) .build(); @@ -101,7 +101,7 @@ public PhysicalProperties getPhysicalProperties() { @Override public Numbers withChildren(List children) { Preconditions.checkArgument(children().size() == 1 - && children().get(0) instanceof TVFProperties); - return new Numbers((TVFProperties) children.get(0)); + && children().get(0) instanceof Properties); + return new Numbers((Properties) children.get(0)); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/S3.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/S3.java index d871219b7a8e30..29d8ad08218614 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/S3.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/S3.java @@ -19,7 +19,7 @@ import org.apache.doris.catalog.FunctionSignature; import org.apache.doris.nereids.exceptions.AnalysisException; -import org.apache.doris.nereids.trees.expressions.TVFProperties; +import 
org.apache.doris.nereids.trees.expressions.Properties; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.nereids.types.coercion.AnyDataType; import org.apache.doris.tablefunction.S3TableValuedFunction; @@ -29,13 +29,13 @@ /** s3 */ public class S3 extends TableValuedFunction { - public S3(TVFProperties properties) { + public S3(Properties properties) { super("s3", properties); } @Override public FunctionSignature customSignature() { - return FunctionSignature.of(AnyDataType.INSTANCE, getArgumentsTypes()); + return FunctionSignature.of(AnyDataType.INSTANCE_WITHOUT_INDEX, getArgumentsTypes()); } @Override @@ -45,7 +45,7 @@ protected TableValuedFunctionIf toCatalogFunction() { return new S3TableValuedFunction(arguments); } catch (Throwable t) { throw new AnalysisException("Can not build S3TableValuedFunction by " - + this + ": " + t.getMessage(), t); + + this + ": " + t.getMessage(), t); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/TableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/TableValuedFunction.java index 6443eab9728539..5acc73eb75a9f8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/TableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/TableValuedFunction.java @@ -23,8 +23,8 @@ import org.apache.doris.nereids.exceptions.UnboundException; import org.apache.doris.nereids.properties.PhysicalProperties; import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.Properties; import org.apache.doris.nereids.trees.expressions.Slot; -import org.apache.doris.nereids.trees.expressions.TVFProperties; import org.apache.doris.nereids.trees.expressions.functions.BoundFunction; import 
org.apache.doris.nereids.trees.expressions.functions.CustomSignature; import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression; @@ -57,7 +57,7 @@ public abstract class TableValuedFunction extends BoundFunction implements Unary } }); - public TableValuedFunction(String functionName, TVFProperties tvfProperties) { + public TableValuedFunction(String functionName, Properties tvfProperties) { super(functionName, tvfProperties); } @@ -78,8 +78,8 @@ public Statistics computeStats(List slots) { return new Statistics(0, columnToStatistics); } - public TVFProperties getTVFProperties() { - return (TVFProperties) child(0); + public Properties getTVFProperties() { + return (Properties) child(0); } public final String getTableName() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java index 4517f444e0ba20..d95f35a6f69ca1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java @@ -63,6 +63,7 @@ import org.apache.doris.nereids.trees.expressions.NullSafeEqual; import org.apache.doris.nereids.trees.expressions.Or; import org.apache.doris.nereids.trees.expressions.OrderExpression; +import org.apache.doris.nereids.trees.expressions.Properties; import org.apache.doris.nereids.trees.expressions.ScalarSubquery; import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.expressions.SlotReference; @@ -117,6 +118,10 @@ public abstract class ExpressionVisitor public abstract R visit(Expression expr, C context); + public R visitProperties(Properties properties, C context) { + return visit(properties, context); + } + @Override public R visitAggregateFunction(AggregateFunction aggregateFunction, C context) { return 
visitBoundFunction(aggregateFunction, context); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/MapType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/MapType.java index 40e0dae0fdb47a..c576dcb9933a42 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/MapType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/MapType.java @@ -29,6 +29,8 @@ public class MapType extends DataType { public static final MapType INSTANCE = new MapType(); + public static final MapType SYSTEM_DEFAULT = new MapType(); + public static final int WIDTH = 24; private MapType() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/AnyDataType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/AnyDataType.java index be07a7ee7b825f..e1097df65f1404 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/AnyDataType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/AnyDataType.java @@ -26,7 +26,18 @@ */ public class AnyDataType implements AbstractDataType { - public static final AnyDataType INSTANCE = new AnyDataType(); + public static final AnyDataType INSTANCE_WITHOUT_INDEX = new AnyDataType(-1); + + public static final AnyDataType INSTANCE = new AnyDataType(-1); + + private final int index; + + public AnyDataType(int index) { + if (index < 0) { + index = -1; + } + this.index = index; + } @Override public DataType defaultConcreteType() { @@ -47,4 +58,9 @@ public Type toCatalogDataType() { public String simpleString() { return "any"; } + + public int getIndex() { + return index; + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java b/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java index b0540abf9ec22f..efe6104247888f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java @@ -82,6 +82,7 @@ import 
org.apache.doris.policy.StoragePolicy; import org.apache.doris.resource.workloadgroup.WorkloadGroup; import org.apache.doris.statistics.AnalysisInfo; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.system.Backend; import org.apache.doris.system.Frontend; import org.apache.doris.transaction.TransactionState; @@ -1070,16 +1071,16 @@ public static void loadJournal(Env env, Long logId, JournalEntity journal) { env.getBinlogManager().addBarrierLog(log, logId); break; } - // For backward compatible with 2.0.3 case OperationType.OP_UPDATE_TABLE_STATS: { + env.getAnalysisManager().replayUpdateTableStatsStatus((TableStatsMeta) journal.getData()); break; } - // For backward compatible with 2.0.3 case OperationType.OP_PERSIST_AUTO_JOB: { + env.getAnalysisManager().replayPersistSysJob((AnalysisInfo) journal.getData()); break; } - // For backward compatible with 2.0.3 case OperationType.OP_DELETE_TABLE_STATS: { + env.getAnalysisManager().replayTableStatsDeletion((TableStatsDeletionLog) journal.getData()); break; } default: { @@ -1889,4 +1890,16 @@ public long logBarrier(BarrierLog log) { LOG.info("logId {}, barrier {}", logId, log); return logId; } + + public void logCreateTableStats(TableStatsMeta tableStats) { + logEdit(OperationType.OP_UPDATE_TABLE_STATS, tableStats); + } + + public void logAutoJob(AnalysisInfo analysisInfo) { + logEdit(OperationType.OP_PERSIST_AUTO_JOB, analysisInfo); + } + + public void logDeleteTableStats(TableStatsDeletionLog log) { + logEdit(OperationType.OP_DELETE_TABLE_STATS, log); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/TableStatsDeletionLog.java b/fe/fe-core/src/main/java/org/apache/doris/persist/TableStatsDeletionLog.java new file mode 100644 index 00000000000000..4016ff0139e79b --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/TableStatsDeletionLog.java @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license 
agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.persist; + +import org.apache.doris.common.io.Text; +import org.apache.doris.common.io.Writable; +import org.apache.doris.persist.gson.GsonUtils; + +import com.google.gson.annotations.SerializedName; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +public class TableStatsDeletionLog implements Writable { + + @SerializedName("id") + public final long id; + + public TableStatsDeletionLog(long id) { + this.id = id; + } + + @Override + public void write(DataOutput out) throws IOException { + Text.writeString(out, GsonUtils.GSON.toJson(this)); + } + + public static TableStatsDeletionLog read(DataInput dataInput) throws IOException { + return GsonUtils.GSON.fromJson(Text.readString(dataInput), TableStatsDeletionLog.class); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/meta/MetaPersistMethod.java b/fe/fe-core/src/main/java/org/apache/doris/persist/meta/MetaPersistMethod.java index bdc3a5a2246696..b3a9152c6a5e6f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/persist/meta/MetaPersistMethod.java +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/meta/MetaPersistMethod.java @@ -224,7 +224,9 @@ public static MetaPersistMethod create(String name) throws 
NoSuchMethodException metaPersistMethod.writeMethod = Env.class.getDeclaredMethod("saveBinlogs", CountingDataOutputStream.class, long.class); break; + case "AnalysisMgr": + case "AnalysisMgrV2": metaPersistMethod.readMethod = Env.class.getDeclaredMethod("loadAnalysisManager", DataInputStream.class, long.class); metaPersistMethod.writeMethod = diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/meta/PersistMetaModules.java b/fe/fe-core/src/main/java/org/apache/doris/persist/meta/PersistMetaModules.java index 6e99a6757fe1be..ed5210e49f7c54 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/persist/meta/PersistMetaModules.java +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/meta/PersistMetaModules.java @@ -39,11 +39,11 @@ public class PersistMetaModules { "globalVariable", "cluster", "broker", "resources", "exportJob", "syncJob", "backupHandler", "paloAuth", "transactionState", "colocateTableIndex", "routineLoadJobs", "loadJobV2", "smallFiles", "plugins", "deleteHandler", "sqlBlockRule", "policy", "mtmvJobManager", "globalFunction", "workloadGroups", - "binlogs", "resourceGroups", "AnalysisMgr"); + "binlogs", "resourceGroups", "AnalysisMgrV2"); // Modules in this list is deprecated and will not be saved in meta file. (also should not be in MODULE_NAMES) public static final ImmutableList DEPRECATED_MODULE_NAMES = ImmutableList.of( - "loadJob", "cooldownJob"); + "loadJob", "cooldownJob", "AnalysisMgr"); static { MODULES_MAP = Maps.newHashMap(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/AuditLogHelper.java b/fe/fe-core/src/main/java/org/apache/doris/qe/AuditLogHelper.java new file mode 100644 index 00000000000000..40f870eee11c0c --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/AuditLogHelper.java @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.qe; + +import org.apache.doris.analysis.InsertStmt; +import org.apache.doris.analysis.Queriable; +import org.apache.doris.analysis.StatementBase; +import org.apache.doris.catalog.Env; +import org.apache.doris.cluster.ClusterNamespace; +import org.apache.doris.common.Config; +import org.apache.doris.common.util.DebugUtil; +import org.apache.doris.metric.MetricRepo; +import org.apache.doris.plugin.AuditEvent.EventType; +import org.apache.doris.qe.QueryState.MysqlStateType; +import org.apache.doris.service.FrontendOptions; + +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.SpanContext; +import io.opentelemetry.context.Context; +import org.apache.commons.codec.digest.DigestUtils; + +public class AuditLogHelper { + + public static void logAuditLog(ConnectContext ctx, String origStmt, StatementBase parsedStmt, + org.apache.doris.proto.Data.PQueryStatistics statistics, boolean printFuzzyVariables) { + origStmt = origStmt.replace("\n", " "); + // slow query + long endTime = System.currentTimeMillis(); + long elapseMs = endTime - ctx.getStartTime(); + SpanContext spanContext = Span.fromContext(Context.current()).getSpanContext(); + + ctx.getAuditEventBuilder().setEventType(EventType.AFTER_QUERY) + 
.setDb(ClusterNamespace.getNameFromFullName(ctx.getDatabase())) + .setState(ctx.getState().toString()) + .setErrorCode(ctx.getState().getErrorCode() == null ? 0 : ctx.getState().getErrorCode().getCode()) + .setErrorMessage((ctx.getState().getErrorMessage() == null ? "" : + ctx.getState().getErrorMessage().replace("\n", " ").replace("\t", " "))) + .setQueryTime(elapseMs) + .setScanBytes(statistics == null ? 0 : statistics.getScanBytes()) + .setScanRows(statistics == null ? 0 : statistics.getScanRows()) + .setCpuTimeMs(statistics == null ? 0 : statistics.getCpuMs()) + .setPeakMemoryBytes(statistics == null ? 0 : statistics.getMaxPeakMemoryBytes()) + .setReturnRows(ctx.getReturnRows()) + .setStmtId(ctx.getStmtId()) + .setQueryId(ctx.queryId() == null ? "NaN" : DebugUtil.printId(ctx.queryId())) + .setTraceId(spanContext.isValid() ? spanContext.getTraceId() : "") + .setWorkloadGroup(ctx.getWorkloadGroupName()) + .setFuzzyVariables(!printFuzzyVariables ? "" : ctx.getSessionVariable().printFuzzyVariables()); + + if (ctx.getState().isQuery()) { + MetricRepo.COUNTER_QUERY_ALL.increase(1L); + MetricRepo.USER_COUNTER_QUERY_ALL.getOrAdd(ctx.getQualifiedUser()).increase(1L); + if (ctx.getState().getStateType() == MysqlStateType.ERR + && ctx.getState().getErrType() != QueryState.ErrType.ANALYSIS_ERR) { + // err query + MetricRepo.COUNTER_QUERY_ERR.increase(1L); + MetricRepo.USER_COUNTER_QUERY_ERR.getOrAdd(ctx.getQualifiedUser()).increase(1L); + } else if (ctx.getState().getStateType() == MysqlStateType.OK + || ctx.getState().getStateType() == MysqlStateType.EOF) { + // ok query + MetricRepo.HISTO_QUERY_LATENCY.update(elapseMs); + MetricRepo.USER_HISTO_QUERY_LATENCY.getOrAdd(ctx.getQualifiedUser()).update(elapseMs); + + if (elapseMs > Config.qe_slow_log_ms) { + String sqlDigest = DigestUtils.md5Hex(((Queriable) parsedStmt).toDigest()); + ctx.getAuditEventBuilder().setSqlDigest(sqlDigest); + } + } + ctx.getAuditEventBuilder().setIsQuery(true); + if (ctx.getQueryDetail() != null) { 
+ ctx.getQueryDetail().setEventTime(endTime); + ctx.getQueryDetail().setEndTime(endTime); + ctx.getQueryDetail().setLatency(elapseMs); + ctx.getQueryDetail().setState(QueryDetail.QueryMemState.FINISHED); + QueryDetailQueue.addOrUpdateQueryDetail(ctx.getQueryDetail()); + ctx.setQueryDetail(null); + } + } else { + ctx.getAuditEventBuilder().setIsQuery(false); + } + ctx.getAuditEventBuilder().setIsNereids(ctx.getState().isNereids); + + ctx.getAuditEventBuilder().setFeIp(FrontendOptions.getLocalHostAddress()); + + // We put origin query stmt at the end of audit log, for parsing the log more convenient. + if (!ctx.getState().isQuery() && (parsedStmt != null && parsedStmt.needAuditEncryption())) { + ctx.getAuditEventBuilder().setStmt(parsedStmt.toSql()); + } else { + if (parsedStmt instanceof InsertStmt && !((InsertStmt) parsedStmt).needLoadManager() + && ((InsertStmt) parsedStmt).isValuesOrConstantSelect()) { + // INSERT INTO VALUES may be very long, so we only log at most 1K bytes. + int length = Math.min(1024, origStmt.length()); + ctx.getAuditEventBuilder().setStmt(origStmt.substring(0, length)); + } else { + ctx.getAuditEventBuilder().setStmt(origStmt); + } + } + if (!Env.getCurrentEnv().isMaster()) { + if (ctx.executor.isForwardToMaster()) { + ctx.getAuditEventBuilder().setState(ctx.executor.getProxyStatus()); + } + } + Env.getCurrentAuditEventProcessor().handleAuditEvent(ctx.getAuditEventBuilder().build()); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/DdlExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/DdlExecutor.java index f50d8d1ae02330..afd95a50fe00d9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/DdlExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/DdlExecutor.java @@ -40,7 +40,6 @@ import org.apache.doris.analysis.AlterRoutineLoadStmt; import org.apache.doris.analysis.AlterSqlBlockRuleStmt; import org.apache.doris.analysis.AlterSystemStmt; -import org.apache.doris.analysis.AlterTableStatsStmt; import 
org.apache.doris.analysis.AlterTableStmt; import org.apache.doris.analysis.AlterUserStmt; import org.apache.doris.analysis.AlterViewStmt; @@ -162,8 +161,6 @@ public static void execute(Env env, DdlStmt ddlStmt) throws Exception { env.createMaterializedView((CreateMaterializedViewStmt) ddlStmt); } else if (ddlStmt instanceof AlterTableStmt) { env.alterTable((AlterTableStmt) ddlStmt); - } else if (ddlStmt instanceof AlterTableStatsStmt) { - StatisticsRepository.alterTableStatistics((AlterTableStatsStmt) ddlStmt); } else if (ddlStmt instanceof AlterColumnStatsStmt) { StatisticsRepository.alterColumnStatistics((AlterColumnStatsStmt) ddlStmt); } else if (ddlStmt instanceof AlterViewStmt) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/InternalQueryExecutionException.java b/fe/fe-core/src/main/java/org/apache/doris/qe/InternalQueryExecutionException.java new file mode 100644 index 00000000000000..c368533c53df72 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/InternalQueryExecutionException.java @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.qe; + +public class InternalQueryExecutionException extends RuntimeException { + public InternalQueryExecutionException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index d42474965488ce..3451811445370a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -403,6 +403,8 @@ public class SessionVariable implements Serializable, Writable { public static final String TEST_QUERY_CACHE_HIT = "test_query_cache_hit"; + public static final String ENABLE_FULL_AUTO_ANALYZE = "enable_full_auto_analyze"; + public static final List DEBUG_VARIABLES = ImmutableList.of( SKIP_DELETE_PREDICATE, SKIP_DELETE_BITMAP, @@ -1142,6 +1144,24 @@ public void setMaxJoinNumberOfReorder(int maxJoinNumberOfReorder) { options = {"none", "sql_cache", "partition_cache"}) public String testQueryCacheHit = "none"; + @VariableMgr.VarAttr(name = ENABLE_FULL_AUTO_ANALYZE, + description = {"该参数控制是否开启自动收集", "Set false to disable auto analyze"}, + flag = VariableMgr.GLOBAL) + public boolean enableFullAutoAnalyze = true; + + @VariableMgr.VarAttr(name = FULL_AUTO_ANALYZE_START_TIME, needForward = true, checker = "checkAnalyzeTimeFormat", + description = {"该参数定义自动ANALYZE例程的开始时间", + "This parameter defines the start time for the automatic ANALYZE routine."}, + flag = VariableMgr.GLOBAL) + public String fullAutoAnalyzeStartTime = "00:00:00"; + + @VariableMgr.VarAttr(name = FULL_AUTO_ANALYZE_END_TIME, needForward = true, checker = "checkAnalyzeTimeFormat", + description = {"该参数定义自动ANALYZE例程的结束时间", + "This parameter defines the end time for the automatic ANALYZE routine."}, + flag = VariableMgr.GLOBAL) + public String fullAutoAnalyzeEndTime = "02:00:00"; + + // If this fe is in fuzzy mode, then will use 
initFuzzyModeVariables to generate some variables, // not the default value set in the code. public void initFuzzyModeVariables() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java index 06e8f4d392902e..f4f40047b755d7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java @@ -138,7 +138,6 @@ import org.apache.doris.catalog.TabletInvertedIndex; import org.apache.doris.catalog.TabletMeta; import org.apache.doris.catalog.View; -import org.apache.doris.catalog.external.ExternalTable; import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.clone.DynamicPartitionScheduler; import org.apache.doris.cluster.ClusterNamespace; @@ -196,7 +195,7 @@ import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.Histogram; import org.apache.doris.statistics.StatisticsRepository; -import org.apache.doris.statistics.TableStatistic; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.statistics.query.QueryStatsUtil; import org.apache.doris.system.Backend; import org.apache.doris.system.Diagnoser; @@ -240,7 +239,6 @@ import java.util.List; import java.util.Map; import java.util.Objects; -import java.util.Optional; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.function.Predicate; @@ -2392,29 +2390,16 @@ private void handleShowDataSkew() throws AnalysisException { private void handleShowTableStats() { ShowTableStatsStmt showTableStatsStmt = (ShowTableStatsStmt) stmt; TableIf tableIf = showTableStatsStmt.getTable(); - long partitionId = showTableStatsStmt.getPartitionId(); - boolean showCache = showTableStatsStmt.isCached(); - try { - if (tableIf instanceof ExternalTable && showCache) { - Optional tableStatistics = Env.getCurrentEnv().getStatisticsCache().getTableStatistics( - 
tableIf.getDatabase().getCatalog().getId(), - tableIf.getDatabase().getId(), - tableIf.getId()); - if (tableStatistics.isPresent()) { - resultSet = showTableStatsStmt.constructResultSet(tableStatistics.get()); - } else { - resultSet = showTableStatsStmt.constructResultSet(TableStatistic.UNKNOWN); - } - } else if (partitionId > 0) { - TableStatistic partStats = StatisticsRepository.fetchTableLevelOfPartStats(partitionId); - resultSet = showTableStatsStmt.constructResultSet(partStats); - } else { - TableStatistic tableStats = StatisticsRepository.fetchTableLevelStats(tableIf.getId()); - resultSet = showTableStatsStmt.constructResultSet(tableStats); - } - } catch (DdlException e) { - LOG.warn("Table statistics do not exist: {}", tableIf.getName()); - resultSet = showTableStatsStmt.constructResultSet(TableStatistic.UNKNOWN); + TableStatsMeta tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(tableIf.getId()); + /* + HMSExternalTable table will fetch row count from HMS + or estimate with file size and schema if it's not analyzed. + tableStats == null means it's not analyzed, in this case show the estimated row count. 
+ */ + if (tableStats == null && tableIf instanceof HMSExternalTable) { + resultSet = showTableStatsStmt.constructResultSet(tableIf.estimatedRowCount()); + } else { + resultSet = showTableStatsStmt.constructResultSet(tableStats); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java index 197c5fc9088dd3..5d90008ff758b3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java @@ -142,8 +142,8 @@ import org.apache.doris.rewrite.mvrewrite.MVSelectFailedException; import org.apache.doris.rpc.RpcException; import org.apache.doris.service.FrontendOptions; +import org.apache.doris.statistics.ResultRow; import org.apache.doris.statistics.util.InternalQueryBuffer; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.task.LoadEtlTask; import org.apache.doris.thrift.TFileFormatType; import org.apache.doris.thrift.TFileType; @@ -2490,7 +2490,8 @@ public List executeInternalQuery() { planner = new NereidsPlanner(statementContext); planner.plan(parsedStmt, context.getSessionVariable().toThrift()); } catch (Exception e) { - LOG.warn("fall back to legacy planner, because: {}", e.getMessage(), e); + LOG.warn("Arrow Flight SQL fall back to legacy planner, because: {}", + e.getMessage(), e); parsedStmt = null; planner = null; context.getState().setNereids(false); @@ -2502,10 +2503,9 @@ public List executeInternalQuery() { analyze(context.getSessionVariable().toThrift()); } } catch (Exception e) { - throw new RuntimeException("Failed to execute internal SQL. " - + Util.getRootCauseMessage(e) + " " + originStmt.toString(), e); + LOG.warn("Failed to run internal SQL: {}", originStmt, e); + throw new RuntimeException("Failed to execute internal SQL. 
" + Util.getRootCauseMessage(e), e); } - planner.getFragments(); RowBatch batch; coord = new Coordinator(context, analyzer, planner, context.getStatsErrorEstimator()); profile.addExecutionProfile(coord.getExecutionProfile()); @@ -2513,8 +2513,7 @@ public List executeInternalQuery() { QeProcessorImpl.INSTANCE.registerQuery(context.queryId(), new QeProcessorImpl.QueryInfo(context, originStmt.originStmt, coord)); } catch (UserException e) { - throw new RuntimeException("Failed to execute internal SQL. " - + " " + Util.getRootCauseMessage(e) + originStmt.toString(), e); + throw new RuntimeException("Failed to execute internal SQL. " + Util.getRootCauseMessage(e), e); } Span queryScheduleSpan = context.getTracer() @@ -2523,8 +2522,7 @@ public List executeInternalQuery() { coord.exec(); } catch (Exception e) { queryScheduleSpan.recordException(e); - throw new RuntimeException("Failed to execute internal SQL. " - + Util.getRootCauseMessage(e) + " " + originStmt.toString(), e); + throw new InternalQueryExecutionException(e.getMessage() + Util.getRootCauseMessage(e), e); } finally { queryScheduleSpan.end(); } @@ -2541,21 +2539,19 @@ public List executeInternalQuery() { } } catch (Exception e) { fetchResultSpan.recordException(e); - throw new RuntimeException("Failed to execute internal SQL. " + Util.getRootCauseMessage(e) + " " - + originStmt.toString(), e); + throw new RuntimeException("Failed to fetch internal SQL result. 
" + Util.getRootCauseMessage(e), e); } finally { fetchResultSpan.end(); } } finally { + AuditLogHelper.logAuditLog(context, originStmt.toString(), parsedStmt, getQueryStatisticsForAuditLog(), + true); QeProcessorImpl.INSTANCE.unregisterQuery(context.queryId()); } } private List convertResultBatchToResultRows(TResultBatch batch) { List columns = parsedStmt.getColLabels(); - List types = parsedStmt.getResultExprs().stream() - .map(e -> e.getType().getPrimitiveType()) - .collect(Collectors.toList()); List resultRows = new ArrayList<>(); List rows = batch.getRows(); for (ByteBuffer buffer : rows) { @@ -2566,8 +2562,7 @@ private List convertResultBatchToResultRows(TResultBatch batch) { String value = queryBuffer.readStringWithLength(); values.add(value); } - - ResultRow resultRow = new ResultRow(columns, types, values); + ResultRow resultRow = new ResultRow(values); resultRows.add(resultRow); } return resultRows; @@ -2590,6 +2585,14 @@ public void setProxyResultSet(ShowResultSet proxyResultSet) { this.proxyResultSet = proxyResultSet; } + public ConnectContext getContext() { + return context; + } + + public OriginStatement getOriginStmt() { + return originStmt; + } + public String getOriginStmtInString() { if (originStmt != null && originStmt.originStmt != null) { return originStmt.originStmt; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java index ee39582aac4df0..4df6f9cf7afe16 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java @@ -22,19 +22,21 @@ import org.apache.doris.common.io.Text; import org.apache.doris.common.io.Writable; import org.apache.doris.persist.gson.GsonUtils; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import com.google.gson.Gson; import 
com.google.gson.annotations.SerializedName; import com.google.gson.reflect.TypeToken; +import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.core.util.CronExpression; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.lang.reflect.Type; +import java.text.ParseException; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -46,6 +48,7 @@ public class AnalysisInfo implements Writable { private static final Logger LOG = LogManager.getLogger(AnalysisInfo.class); + // TODO: useless, remove it later public enum AnalysisMode { INCREMENTAL, FULL @@ -66,10 +69,12 @@ public enum JobType { // submit by user directly MANUAL, // submit by system automatically - SYSTEM + SYSTEM; } public enum ScheduleType { + // Job created by AutoCollector is also `ONCE` type, this is because it runs once only and should be removed + // when its information is expired ONCE, PERIOD, AUTOMATIC @@ -95,6 +100,7 @@ public enum ScheduleType { @SerializedName("tblName") public final String tblName; + // TODO: Map here is wired, List is enough @SerializedName("colToPartitions") public final Map> colToPartitions; @@ -123,7 +129,7 @@ public enum ScheduleType { public final int samplePercent; @SerializedName("sampleRows") - public final int sampleRows; + public final long sampleRows; @SerializedName("maxBucketNum") public final int maxBucketNum; @@ -151,24 +157,39 @@ public enum ScheduleType { // True means this task is a table level task for external table. // This kind of task is mainly to collect the number of rows of a table. 
@SerializedName("externalTableLevelTask") - public boolean externalTableLevelTask; + public final boolean externalTableLevelTask; @SerializedName("partitionOnly") - public boolean partitionOnly; + public final boolean partitionOnly; @SerializedName("samplingPartition") - public boolean samplingPartition; + public final boolean samplingPartition; + + @SerializedName("isAllPartition") + public final boolean isAllPartition; + + @SerializedName("partitionCount") + public final long partitionCount; // For serialize @SerializedName("cronExpr") public String cronExprStr; + @SerializedName("progress") + public String progress; + + public CronExpression cronExpression; + + @SerializedName("forceFull") + public final boolean forceFull; + public AnalysisInfo(long jobId, long taskId, List taskIds, String catalogName, String dbName, String tblName, Map> colToPartitions, Set partitionNames, String colName, Long indexId, JobType jobType, AnalysisMode analysisMode, AnalysisMethod analysisMethod, AnalysisType analysisType, - int samplePercent, int sampleRows, int maxBucketNum, long periodTimeInMs, String message, + int samplePercent, long sampleRows, int maxBucketNum, long periodTimeInMs, String message, long lastExecTimeInMs, long timeCostInMs, AnalysisState state, ScheduleType scheduleType, - boolean isExternalTableLevelTask, boolean partitionOnly, boolean samplingPartition) { + boolean isExternalTableLevelTask, boolean partitionOnly, boolean samplingPartition, + boolean isAllPartition, long partitionCount, CronExpression cronExpression, boolean forceFull) { this.jobId = jobId; this.taskId = taskId; this.taskIds = taskIds; @@ -195,6 +216,13 @@ public AnalysisInfo(long jobId, long taskId, List taskIds, String catalogN this.externalTableLevelTask = isExternalTableLevelTask; this.partitionOnly = partitionOnly; this.samplingPartition = samplingPartition; + this.isAllPartition = isAllPartition; + this.partitionCount = partitionCount; + this.cronExpression = cronExpression; + if 
(cronExpression != null) { + this.cronExprStr = cronExpression.getCronExpression(); + } + this.forceFull = forceFull; } @Override @@ -205,11 +233,11 @@ public String toString() { sj.add("DBName: " + dbName); sj.add("TableName: " + tblName); sj.add("ColumnName: " + colName); - sj.add("TaskType: " + analysisType.toString()); - sj.add("TaskMode: " + analysisMode.toString()); - sj.add("TaskMethod: " + analysisMethod.toString()); + sj.add("TaskType: " + analysisType); + sj.add("TaskMode: " + analysisMode); + sj.add("TaskMethod: " + analysisMethod); sj.add("Message: " + message); - sj.add("CurrentState: " + state.toString()); + sj.add("CurrentState: " + state); if (samplePercent > 0) { sj.add("SamplePercent: " + samplePercent); } @@ -231,6 +259,10 @@ public String toString() { if (periodTimeInMs > 0) { sj.add("periodTimeInMs: " + StatisticsUtil.getReadableTime(periodTimeInMs)); } + if (StringUtils.isNotEmpty(cronExprStr)) { + sj.add("cronExpr: " + cronExprStr); + } + sj.add("forceFull: " + forceFull); return sj.toString(); } @@ -246,60 +278,6 @@ public void addTaskId(long taskId) { taskIds.add(taskId); } - // TODO: use thrift - public static AnalysisInfo fromResultRow(ResultRow resultRow) { - try { - AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder(); - long jobId = Long.parseLong(resultRow.getColumnValue("job_id")); - analysisInfoBuilder.setJobId(jobId); - long taskId = Long.parseLong(resultRow.getColumnValue("task_id")); - analysisInfoBuilder.setTaskId(taskId); - String catalogName = resultRow.getColumnValue("catalog_name"); - analysisInfoBuilder.setCatalogName(catalogName); - String dbName = resultRow.getColumnValue("db_name"); - analysisInfoBuilder.setDbName(dbName); - String tblName = resultRow.getColumnValue("tbl_name"); - analysisInfoBuilder.setTblName(tblName); - String colName = resultRow.getColumnValue("col_name"); - analysisInfoBuilder.setColName(colName); - long indexId = Long.parseLong(resultRow.getColumnValue("index_id")); - 
analysisInfoBuilder.setIndexId(indexId); - String partitionNames = resultRow.getColumnValue("col_partitions"); - Map> colToPartitions = getColToPartition(partitionNames); - analysisInfoBuilder.setColToPartitions(colToPartitions); - String jobType = resultRow.getColumnValue("job_type"); - analysisInfoBuilder.setJobType(JobType.valueOf(jobType)); - String analysisType = resultRow.getColumnValue("analysis_type"); - analysisInfoBuilder.setAnalysisType(AnalysisType.valueOf(analysisType)); - String analysisMode = resultRow.getColumnValue("analysis_mode"); - analysisInfoBuilder.setAnalysisMode(AnalysisMode.valueOf(analysisMode)); - String analysisMethod = resultRow.getColumnValue("analysis_method"); - analysisInfoBuilder.setAnalysisMethod(AnalysisMethod.valueOf(analysisMethod)); - String scheduleType = resultRow.getColumnValue("schedule_type"); - analysisInfoBuilder.setScheduleType(ScheduleType.valueOf(scheduleType)); - String state = resultRow.getColumnValue("state"); - analysisInfoBuilder.setState(AnalysisState.valueOf(state)); - String samplePercent = resultRow.getColumnValue("sample_percent"); - analysisInfoBuilder.setSamplePercent(StatisticsUtil.convertStrToInt(samplePercent)); - String sampleRows = resultRow.getColumnValue("sample_rows"); - analysisInfoBuilder.setSampleRows(StatisticsUtil.convertStrToInt(sampleRows)); - String maxBucketNum = resultRow.getColumnValue("max_bucket_num"); - analysisInfoBuilder.setMaxBucketNum(StatisticsUtil.convertStrToInt(maxBucketNum)); - String periodTimeInMs = resultRow.getColumnValue("period_time_in_ms"); - analysisInfoBuilder.setPeriodTimeInMs(StatisticsUtil.convertStrToInt(periodTimeInMs)); - String lastExecTimeInMs = resultRow.getColumnValue("last_exec_time_in_ms"); - analysisInfoBuilder.setLastExecTimeInMs(StatisticsUtil.convertStrToLong(lastExecTimeInMs)); - String timeCostInMs = resultRow.getColumnValue("time_cost_in_ms"); - analysisInfoBuilder.setTimeCostInMs(StatisticsUtil.convertStrToLong(timeCostInMs)); - String message = 
resultRow.getColumnValue("message"); - analysisInfoBuilder.setMessage(message); - return analysisInfoBuilder.build(); - } catch (Exception e) { - LOG.warn("Failed to deserialize analysis task info.", e); - return null; - } - } - public String getColToPartitionStr() { if (colToPartitions == null || colToPartitions.isEmpty()) { return ""; @@ -362,7 +340,15 @@ public static AnalysisInfo read(DataInput dataInput) throws IOException { return analysisInfoBuilder.build(); } else { String json = Text.readString(dataInput); - return GsonUtils.GSON.fromJson(json, AnalysisInfo.class); + AnalysisInfo analysisInfo = GsonUtils.GSON.fromJson(json, AnalysisInfo.class); + if (analysisInfo.cronExprStr != null) { + try { + analysisInfo.cronExpression = new CronExpression(analysisInfo.cronExprStr); + } catch (ParseException e) { + LOG.warn("Cron expression of job is invalid, there is a bug", e); + } + } + return analysisInfo; } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java index 2fd0e25d727cc1..b6d9aafe853fdc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java @@ -23,6 +23,8 @@ import org.apache.doris.statistics.AnalysisInfo.JobType; import org.apache.doris.statistics.AnalysisInfo.ScheduleType; +import org.apache.logging.log4j.core.util.CronExpression; + import java.util.List; import java.util.Map; import java.util.Set; @@ -44,7 +46,7 @@ public class AnalysisInfoBuilder { private AnalysisType analysisType; private int maxBucketNum; private int samplePercent; - private int sampleRows; + private long sampleRows; private long periodTimeInMs; private long lastExecTimeInMs; private long timeCostInMs; @@ -54,6 +56,10 @@ public class AnalysisInfoBuilder { private boolean externalTableLevelTask; private boolean partitionOnly; private boolean 
samplingPartition; + private boolean isAllPartition; + private long partitionCount; + private CronExpression cronExpression; + private boolean forceFull; public AnalysisInfoBuilder() { } @@ -85,6 +91,10 @@ public AnalysisInfoBuilder(AnalysisInfo info) { externalTableLevelTask = info.externalTableLevelTask; partitionOnly = info.partitionOnly; samplingPartition = info.samplingPartition; + isAllPartition = info.isAllPartition; + partitionCount = info.partitionCount; + cronExpression = info.cronExpression; + forceFull = info.forceFull; } public AnalysisInfoBuilder setJobId(long jobId) { @@ -167,7 +177,7 @@ public AnalysisInfoBuilder setSamplePercent(int samplePercent) { return this; } - public AnalysisInfoBuilder setSampleRows(int sampleRows) { + public AnalysisInfoBuilder setSampleRows(long sampleRows) { this.sampleRows = sampleRows; return this; } @@ -217,11 +227,30 @@ public AnalysisInfoBuilder setSamplingPartition(boolean samplingPartition) { return this; } + public AnalysisInfoBuilder setAllPartition(boolean isAllPartition) { + this.isAllPartition = isAllPartition; + return this; + } + + public AnalysisInfoBuilder setPartitionCount(long partitionCount) { + this.partitionCount = partitionCount; + return this; + } + + public void setCronExpression(CronExpression cronExpression) { + this.cronExpression = cronExpression; + } + + public void setForceFull(boolean forceFull) { + this.forceFull = forceFull; + } + public AnalysisInfo build() { return new AnalysisInfo(jobId, taskId, taskIds, catalogName, dbName, tblName, colToPartitions, partitionNames, colName, indexId, jobType, analysisMode, analysisMethod, analysisType, samplePercent, sampleRows, maxBucketNum, periodTimeInMs, message, lastExecTimeInMs, timeCostInMs, state, scheduleType, - externalTableLevelTask, partitionOnly, samplingPartition); + externalTableLevelTask, partitionOnly, samplingPartition, isAllPartition, partitionCount, + cronExpression, forceFull); } public AnalysisInfoBuilder copy() { @@ -248,6 +277,10 
@@ public AnalysisInfoBuilder copy() { .setTimeCostInMs(timeCostInMs) .setState(state) .setScheduleType(scheduleType) - .setExternalTableLevelTask(externalTableLevelTask); + .setExternalTableLevelTask(externalTableLevelTask) + .setSamplingPartition(samplingPartition) + .setPartitionOnly(partitionOnly) + .setAllPartition(isAllPartition) + .setPartitionCount(partitionCount); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java index 5ac0c0cd5248c1..3eebb1ec9aa24d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java @@ -18,6 +18,7 @@ package org.apache.doris.statistics; import org.apache.doris.analysis.AnalyzeDBStmt; +import org.apache.doris.analysis.AnalyzeProperties; import org.apache.doris.analysis.AnalyzeStmt; import org.apache.doris.analysis.AnalyzeTblStmt; import org.apache.doris.analysis.DropAnalyzeJobStmt; @@ -28,12 +29,9 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.MaterializedIndexMeta; -import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.TableIf; -import org.apache.doris.catalog.TableIf.TableType; import org.apache.doris.catalog.View; import org.apache.doris.catalog.external.ExternalTable; import org.apache.doris.catalog.external.HMSExternalTable; @@ -42,12 +40,14 @@ import org.apache.doris.common.DdlException; import org.apache.doris.common.FeConstants; import org.apache.doris.common.ThreadPoolManager.BlockedPolicy; +import org.apache.doris.common.io.Text; import org.apache.doris.common.io.Writable; import org.apache.doris.common.util.Daemon; import org.apache.doris.common.util.Util; -import 
org.apache.doris.datasource.CatalogIf; import org.apache.doris.mysql.privilege.PrivPredicate; import org.apache.doris.persist.AnalyzeDeletionLog; +import org.apache.doris.persist.TableStatsDeletionLog; +import org.apache.doris.persist.gson.GsonUtils; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.ShowResultSet; import org.apache.doris.qe.ShowResultSetMetaData; @@ -56,25 +56,31 @@ import org.apache.doris.statistics.AnalysisInfo.AnalysisType; import org.apache.doris.statistics.AnalysisInfo.JobType; import org.apache.doris.statistics.AnalysisInfo.ScheduleType; +import org.apache.doris.statistics.util.SimpleQueue; import org.apache.doris.statistics.util.StatisticsUtil; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; -import com.google.common.collect.Maps; +import com.google.common.reflect.TypeToken; import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.core.util.CronExpression; import org.jetbrains.annotations.Nullable; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.lang.reflect.Type; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; +import java.util.Date; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -93,26 +99,141 @@ public class AnalysisManager extends Daemon implements Writable { - public AnalysisTaskScheduler taskScheduler; - private static final Logger LOG = LogManager.getLogger(AnalysisManager.class); - private ConcurrentMap> analysisJobIdToTaskMap = new ConcurrentHashMap<>(); + // Tracking running manually submitted async tasks, keep in mem only + protected final ConcurrentMap> 
analysisJobIdToTaskMap = new ConcurrentHashMap<>(); private StatisticsCache statisticsCache; private AnalysisTaskExecutor taskExecutor; + // Store task information in metadata. private final Map analysisTaskInfoMap = Collections.synchronizedMap(new TreeMap<>()); + + // Store job information in metadata private final Map analysisJobInfoMap = Collections.synchronizedMap(new TreeMap<>()); + // Tracking system submitted job, keep in mem only + protected final Map systemJobInfoMap = new ConcurrentHashMap<>(); + + // Tracking and control sync analyze tasks, keep in mem only private final ConcurrentMap ctxToSyncTask = new ConcurrentHashMap<>(); + private final Map idToTblStats = new ConcurrentHashMap<>(); + + protected SimpleQueue autoJobs = createSimpleQueue(null, this); + + private final Function userJobStatusUpdater = w -> { + AnalysisInfo info = w.info; + AnalysisState taskState = w.taskState; + String message = w.message; + long time = w.time; + if (analysisJobIdToTaskMap.get(info.jobId) == null) { + return null; + } + info.state = taskState; + info.message = message; + // Update the task cost time when task finished or failed. And only log the final state. + if (taskState.equals(AnalysisState.FINISHED) || taskState.equals(AnalysisState.FAILED)) { + info.timeCostInMs = time - info.lastExecTimeInMs; + info.lastExecTimeInMs = time; + logCreateAnalysisTask(info); + } + info.lastExecTimeInMs = time; + AnalysisInfo job = analysisJobInfoMap.get(info.jobId); + // Job may get deleted during execution. + if (job == null) { + return null; + } + // Synchronize the job state change in job level. + synchronized (job) { + job.lastExecTimeInMs = time; + // Set the job state to RUNNING when its first task becomes RUNNING. 
+ if (info.state.equals(AnalysisState.RUNNING) && job.state.equals(AnalysisState.PENDING)) { + job.state = AnalysisState.RUNNING; + replayCreateAnalysisJob(job); + } + boolean allFinished = true; + boolean hasFailure = false; + for (BaseAnalysisTask task : analysisJobIdToTaskMap.get(info.jobId).values()) { + AnalysisInfo taskInfo = task.info; + if (taskInfo.state.equals(AnalysisState.RUNNING) || taskInfo.state.equals(AnalysisState.PENDING)) { + allFinished = false; + break; + } + if (taskInfo.state.equals(AnalysisState.FAILED)) { + hasFailure = true; + } + } + if (allFinished) { + if (hasFailure) { + job.state = AnalysisState.FAILED; + } else { + job.state = AnalysisState.FINISHED; + try { + updateTableStats(job); + } catch (Throwable e) { + LOG.warn("Failed to update Table statistics in job: {}", info.toString(), e); + } + } + logCreateAnalysisJob(job); + analysisJobIdToTaskMap.remove(job.jobId); + } + } + return null; + }; + + private final String progressDisplayTemplate = "%d Finished | %d Failed | %d In Progress | %d Total"; + + protected final Function systemJobStatusUpdater = w -> { + AnalysisInfo info = w.info; + info.state = w.taskState; + info.message = w.message; + AnalysisInfo job = systemJobInfoMap.get(info.jobId); + if (job == null) { + return null; + } + int failedCount = 0; + StringJoiner reason = new StringJoiner(", "); + Map taskMap = analysisJobIdToTaskMap.get(info.jobId); + for (BaseAnalysisTask task : taskMap.values()) { + if (task.info.state.equals(AnalysisState.RUNNING) || task.info.state.equals(AnalysisState.PENDING)) { + return null; + } + if (task.info.state.equals(AnalysisState.FAILED)) { + failedCount++; + reason.add(task.info.message); + } + } + try { + updateTableStats(job); + } catch (Throwable e) { + LOG.warn("Failed to update Table statistics in job: {}", info.toString(), e); + } finally { + job.lastExecTimeInMs = System.currentTimeMillis(); + job.message = reason.toString(); + job.progress = String.format(progressDisplayTemplate, + 
taskMap.size() - failedCount, failedCount, 0, taskMap.size()); + if (failedCount > 0) { + job.message = reason.toString(); + job.state = AnalysisState.FAILED; + } else { + job.state = AnalysisState.FINISHED; + } + autoJobs.offer(job); + systemJobInfoMap.remove(info.jobId); + } + return null; + }; + + private final Function[] updaters = + new Function[] {userJobStatusUpdater, systemJobStatusUpdater}; + public AnalysisManager() { super(TimeUnit.SECONDS.toMillis(StatisticConstants.ANALYZE_MANAGER_INTERVAL_IN_SECS)); if (!Env.isCheckpointThread()) { - this.taskScheduler = new AnalysisTaskScheduler(); - this.taskExecutor = new AnalysisTaskExecutor(taskScheduler); + this.taskExecutor = new AnalysisTaskExecutor(Config.statistics_simultaneously_running_task_num); this.statisticsCache = new StatisticsCache(); taskExecutor.start(); } @@ -124,7 +245,7 @@ protected void runOneCycle() { } private void clear() { - clearMeta(analysisJobInfoMap, (a) -> + clearExpiredAnalysisInfo(analysisJobInfoMap, (a) -> a.scheduleType.equals(ScheduleType.ONCE) && System.currentTimeMillis() - a.lastExecTimeInMs > TimeUnit.DAYS.toMillis(StatisticConstants.ANALYSIS_JOB_INFO_EXPIRATION_TIME_IN_DAYS), @@ -132,7 +253,7 @@ private void clear() { Env.getCurrentEnv().getEditLog().logDeleteAnalysisJob(new AnalyzeDeletionLog(id)); return null; }); - clearMeta(analysisTaskInfoMap, (a) -> System.currentTimeMillis() - a.lastExecTimeInMs + clearExpiredAnalysisInfo(analysisTaskInfoMap, (a) -> System.currentTimeMillis() - a.lastExecTimeInMs > TimeUnit.DAYS.toMillis(StatisticConstants.ANALYSIS_JOB_INFO_EXPIRATION_TIME_IN_DAYS), (id) -> { Env.getCurrentEnv().getEditLog().logDeleteAnalysisTask(new AnalyzeDeletionLog(id)); @@ -140,7 +261,7 @@ private void clear() { }); } - private void clearMeta(Map infoMap, Predicate isExpired, + private void clearExpiredAnalysisInfo(Map infoMap, Predicate isExpired, Function writeLog) { synchronized (infoMap) { List expired = new ArrayList<>(); @@ -161,6 +282,9 @@ public 
StatisticsCache getStatisticsCache() { } public void createAnalyze(AnalyzeStmt analyzeStmt, boolean proxy) throws DdlException { + if (!StatisticsUtil.statsTblAvailable() && !FeConstants.runningUnitTest) { + throw new DdlException("Stats table not available, please make sure your cluster status is normal"); + } if (analyzeStmt instanceof AnalyzeDBStmt) { createAnalysisJobs((AnalyzeDBStmt) analyzeStmt, proxy); } else if (analyzeStmt instanceof AnalyzeTblStmt) { @@ -170,38 +294,53 @@ public void createAnalyze(AnalyzeStmt analyzeStmt, boolean proxy) throws DdlExce public void createAnalysisJobs(AnalyzeDBStmt analyzeDBStmt, boolean proxy) throws DdlException { DatabaseIf db = analyzeDBStmt.getDb(); + // Using auto analyzer if user specifies. + if (analyzeDBStmt.getAnalyzeProperties().getProperties().containsKey("use.auto.analyzer")) { + Env.getCurrentEnv().getStatisticsAutoCollector().analyzeDb(db); + return; + } + List analysisInfos = buildAnalysisInfosForDB(db, analyzeDBStmt.getAnalyzeProperties()); + if (!analyzeDBStmt.isSync()) { + sendJobId(analysisInfos, proxy); + } + } + + public List buildAnalysisInfosForDB(DatabaseIf db, AnalyzeProperties analyzeProperties) { + db.readLock(); List tbls = db.getTables(); List analysisInfos = new ArrayList<>(); - db.readLock(); try { List analyzeStmts = new ArrayList<>(); for (TableIf table : tbls) { if (table instanceof View) { continue; } - TableName tableName = new TableName(analyzeDBStmt.getCtlIf().getName(), db.getFullName(), + TableName tableName = new TableName(db.getCatalog().getName(), db.getFullName(), table.getName()); // columnNames null means to add all visitable columns. 
// Will get all the visible columns in analyzeTblStmt.check() - AnalyzeTblStmt analyzeTblStmt = new AnalyzeTblStmt(analyzeDBStmt.getAnalyzeProperties(), tableName, + AnalyzeTblStmt analyzeTblStmt = new AnalyzeTblStmt(analyzeProperties, tableName, null, db.getId(), table); try { analyzeTblStmt.check(); } catch (AnalysisException analysisException) { - throw new DdlException(analysisException.getMessage(), analysisException); + LOG.warn("Failed to build analyze job: {}", + analysisException.getMessage(), analysisException); } analyzeStmts.add(analyzeTblStmt); } for (AnalyzeTblStmt analyzeTblStmt : analyzeStmts) { - analysisInfos.add(buildAndAssignJob(analyzeTblStmt)); - } - if (!analyzeDBStmt.isSync()) { - sendJobId(analysisInfos, proxy); + try { + analysisInfos.add(buildAndAssignJob(analyzeTblStmt)); + } catch (DdlException e) { + LOG.warn("Failed to build analyze job: {}", + e.getMessage(), e); + } } } finally { db.readUnlock(); } - + return analysisInfos; } // Each analyze stmt corresponding to an analysis job. 
@@ -214,11 +353,8 @@ public void createAnalysisJob(AnalyzeTblStmt stmt, boolean proxy) throws DdlExce } @Nullable - private AnalysisInfo buildAndAssignJob(AnalyzeTblStmt stmt) throws DdlException { - if (!StatisticsUtil.statsTblAvailable() && !FeConstants.runningUnitTest) { - throw new DdlException("Stats table not available, please make sure your cluster status is normal"); - } - + @VisibleForTesting + protected AnalysisInfo buildAndAssignJob(AnalyzeTblStmt stmt) throws DdlException { AnalysisInfo jobInfo = buildAnalysisJobInfo(stmt); if (jobInfo.colToPartitions.isEmpty()) { // No statistics need to be collected or updated @@ -228,66 +364,44 @@ private AnalysisInfo buildAndAssignJob(AnalyzeTblStmt stmt) throws DdlException boolean isSync = stmt.isSync(); Map analysisTaskInfos = new HashMap<>(); createTaskForEachColumns(jobInfo, analysisTaskInfos, isSync); - createTaskForMVIdx(jobInfo, analysisTaskInfos, isSync); - if (stmt.isAllColumns()) { - createTaskForExternalTable(jobInfo, analysisTaskInfos, isSync); - } - if (!isSync) { - persistAnalysisJob(jobInfo); - analysisJobIdToTaskMap.put(jobInfo.jobId, analysisTaskInfos); + if (!jobInfo.partitionOnly && stmt.isAllColumns() + && StatisticsUtil.isExternalTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName)) { + createTableLevelTaskForExternalTable(jobInfo, analysisTaskInfos, isSync); } - if (!isSync) { - try { - updateTableStats(jobInfo); - } catch (Throwable e) { - throw new DdlException("Failed to update Table statistics"); - } - } - if (isSync) { syncExecute(analysisTaskInfos.values()); + updateTableStats(jobInfo); return null; } - - analysisTaskInfos.values().forEach(taskScheduler::schedule); - return jobInfo; - } - - // Analysis job created by the system - public void createAnalysisJob(AnalysisInfo info) throws DdlException { - AnalysisInfo jobInfo = buildAnalysisJobInfo(info); - if (jobInfo.colToPartitions.isEmpty()) { - // No statistics need to be collected or updated - return; - } - - Map 
analysisTaskInfos = new HashMap<>(); - createTaskForEachColumns(jobInfo, analysisTaskInfos, false); - createTaskForMVIdx(jobInfo, analysisTaskInfos, false); - if (!jobInfo.jobType.equals(JobType.SYSTEM)) { - persistAnalysisJob(jobInfo); - analysisJobIdToTaskMap.put(jobInfo.jobId, analysisTaskInfos); + recordAnalysisJob(jobInfo); + analysisJobIdToTaskMap.put(jobInfo.jobId, analysisTaskInfos); + // TODO: maybe we should update table stats only when all task succeeded. + updateTableStats(jobInfo); + if (!jobInfo.scheduleType.equals(ScheduleType.PERIOD)) { + analysisTaskInfos.values().forEach(taskExecutor::submitTask); } - - analysisTaskInfos.values().forEach(taskScheduler::schedule); + return jobInfo; } private void sendJobId(List analysisInfos, boolean proxy) { List columns = new ArrayList<>(); + columns.add(new Column("Job_Id", ScalarType.createVarchar(19))); columns.add(new Column("Catalog_Name", ScalarType.createVarchar(1024))); columns.add(new Column("DB_Name", ScalarType.createVarchar(1024))); columns.add(new Column("Table_Name", ScalarType.createVarchar(1024))); columns.add(new Column("Columns", ScalarType.createVarchar(1024))); - columns.add(new Column("Job_Id", ScalarType.createVarchar(19))); ShowResultSetMetaData commonResultSetMetaData = new ShowResultSetMetaData(columns); List> resultRows = new ArrayList<>(); for (AnalysisInfo analysisInfo : analysisInfos) { + if (analysisInfo == null) { + continue; + } List row = new ArrayList<>(); + row.add(String.valueOf(analysisInfo.jobId)); row.add(analysisInfo.catalogName); row.add(analysisInfo.dbName); row.add(analysisInfo.tblName); row.add(analysisInfo.colName); - row.add(String.valueOf(analysisInfo.jobId)); resultRows.add(row); } ShowResultSet commonResultSet = new ShowResultSet(commonResultSetMetaData, resultRows); @@ -317,13 +431,13 @@ private void sendJobId(List analysisInfos, boolean proxy) { * TODO Supports incremental collection of statistics from materialized views */ private Map> 
validateAndGetPartitions(TableIf table, Set columnNames, - Set partitionNames, AnalysisType analysisType, AnalysisMode analysisMode) throws DdlException { + Set partitionNames, AnalysisType analysisType) throws DdlException { long tableId = table.getId(); Map> columnToPartitions = columnNames.stream() .collect(Collectors.toMap( columnName -> columnName, - columnName -> new HashSet<>(partitionNames) + columnName -> new HashSet<>(partitionNames == null ? Collections.emptySet() : partitionNames) )); if (analysisType == AnalysisType.HISTOGRAM) { @@ -340,7 +454,7 @@ private Map> validateAndGetPartitions(TableIf table, Set> existColAndPartsForStats = StatisticsRepository + Map> existColAndPartsForStats = StatisticsRepository .fetchColAndPartsForStats(tableId); if (existColAndPartsForStats.isEmpty()) { @@ -348,42 +462,30 @@ private Map> validateAndGetPartitions(TableIf table, Set existPartIdsForStats = new HashSet<>(); + Set existPartIdsForStats = new HashSet<>(); existColAndPartsForStats.values().forEach(existPartIdsForStats::addAll); - Map idToPartition = StatisticsUtil.getPartitionIdToName(table); + Set idToPartition = StatisticsUtil.getPartitionIds(table); // Get an invalid set of partitions (those partitions were deleted) - Set invalidPartIds = existPartIdsForStats.stream() - .filter(id -> !idToPartition.containsKey(id)).collect(Collectors.toSet()); + Set invalidPartIds = existPartIdsForStats.stream() + .filter(id -> !idToPartition.contains(id)).collect(Collectors.toSet()); if (!invalidPartIds.isEmpty()) { // Delete invalid partition statistics to avoid affecting table statistics StatisticsRepository.dropStatistics(invalidPartIds); } - if (analysisMode == AnalysisMode.INCREMENTAL && analysisType == AnalysisType.FUNDAMENTALS) { - existColAndPartsForStats.values().forEach(partIds -> partIds.removeAll(invalidPartIds)); - // In incremental collection mode, just collect the uncollected partition statistics - existColAndPartsForStats.forEach((columnName, partitionIds) -> { 
- Set existPartitions = partitionIds.stream() - .map(idToPartition::get) - .collect(Collectors.toSet()); - columnToPartitions.computeIfPresent(columnName, (colName, partNames) -> { - partNames.removeAll(existPartitions); - return partNames; - }); - }); - if (invalidPartIds.isEmpty()) { - // There is no invalid statistics, so there is no need to update table statistics, - // remove columns that do not require re-collection of statistics - columnToPartitions.entrySet().removeIf(entry -> entry.getValue().isEmpty()); - } + if (analysisType == AnalysisType.FUNDAMENTALS) { + return table.findReAnalyzeNeededPartitions(); } return columnToPartitions; } - private AnalysisInfo buildAnalysisJobInfo(AnalyzeTblStmt stmt) throws DdlException { - AnalysisInfoBuilder taskInfoBuilder = new AnalysisInfoBuilder(); + // Make sure colName of job has all the column as this AnalyzeStmt specified, no matter whether it will be analyzed + // or not. + @VisibleForTesting + public AnalysisInfo buildAnalysisJobInfo(AnalyzeTblStmt stmt) throws DdlException { + AnalysisInfoBuilder infoBuilder = new AnalysisInfoBuilder(); long jobId = Env.getCurrentEnv().getNextId(); String catalogName = stmt.getCatalogName(); String db = stmt.getDBName(); @@ -395,140 +497,75 @@ private AnalysisInfo buildAnalysisJobInfo(AnalyzeTblStmt stmt) throws DdlExcepti Set partitionNames = stmt.getPartitionNames(); boolean partitionOnly = stmt.isPartitionOnly(); boolean isSamplingPartition = stmt.isSamplingPartition(); + boolean isAllPartition = stmt.isAllPartitions(); + long partitionCount = stmt.getPartitionCount(); int samplePercent = stmt.getSamplePercent(); int sampleRows = stmt.getSampleRows(); AnalysisType analysisType = stmt.getAnalysisType(); AnalysisMode analysisMode = stmt.getAnalysisMode(); AnalysisMethod analysisMethod = stmt.getAnalysisMethod(); ScheduleType scheduleType = stmt.getScheduleType(); + CronExpression cronExpression = stmt.getCron(); - taskInfoBuilder.setJobId(jobId); - 
taskInfoBuilder.setCatalogName(catalogName); - taskInfoBuilder.setDbName(db); - taskInfoBuilder.setTblName(tblName); + infoBuilder.setJobId(jobId); + infoBuilder.setCatalogName(catalogName); + infoBuilder.setDbName(db); + infoBuilder.setTblName(tblName); + // TODO: Refactor later, DON'T MODIFY IT RIGHT NOW StringJoiner stringJoiner = new StringJoiner(",", "[", "]"); for (String colName : columnNames) { stringJoiner.add(colName); } - taskInfoBuilder.setColName(stringJoiner.toString()); - taskInfoBuilder.setPartitionNames(partitionNames); - taskInfoBuilder.setPartitionOnly(partitionOnly); - taskInfoBuilder.setSamplingPartition(isSamplingPartition); - taskInfoBuilder.setJobType(JobType.MANUAL); - taskInfoBuilder.setState(AnalysisState.PENDING); - taskInfoBuilder.setLastExecTimeInMs(System.currentTimeMillis()); - taskInfoBuilder.setAnalysisType(analysisType); - taskInfoBuilder.setAnalysisMode(analysisMode); - taskInfoBuilder.setAnalysisMethod(analysisMethod); - taskInfoBuilder.setScheduleType(scheduleType); - taskInfoBuilder.setLastExecTimeInMs(0); - + infoBuilder.setColName(stringJoiner.toString()); + infoBuilder.setPartitionNames(partitionNames); + infoBuilder.setPartitionOnly(partitionOnly); + infoBuilder.setSamplingPartition(isSamplingPartition); + infoBuilder.setAllPartition(isAllPartition); + infoBuilder.setPartitionCount(partitionCount); + infoBuilder.setJobType(JobType.MANUAL); + infoBuilder.setState(AnalysisState.PENDING); + infoBuilder.setLastExecTimeInMs(System.currentTimeMillis()); + infoBuilder.setAnalysisType(analysisType); + infoBuilder.setAnalysisMode(analysisMode); + infoBuilder.setAnalysisMethod(analysisMethod); + infoBuilder.setScheduleType(scheduleType); + infoBuilder.setLastExecTimeInMs(0); + infoBuilder.setCronExpression(cronExpression); + infoBuilder.setForceFull(stmt.forceFull()); if (analysisMethod == AnalysisMethod.SAMPLE) { - taskInfoBuilder.setSamplePercent(samplePercent); - taskInfoBuilder.setSampleRows(sampleRows); + 
infoBuilder.setSamplePercent(samplePercent); + infoBuilder.setSampleRows(sampleRows); } if (analysisType == AnalysisType.HISTOGRAM) { int numBuckets = stmt.getNumBuckets(); int maxBucketNum = numBuckets > 0 ? numBuckets : StatisticConstants.HISTOGRAM_MAX_BUCKET_NUM; - taskInfoBuilder.setMaxBucketNum(maxBucketNum); + infoBuilder.setMaxBucketNum(maxBucketNum); } - if (scheduleType == ScheduleType.PERIOD) { - long periodTimeInMs = stmt.getPeriodTimeInMs(); - taskInfoBuilder.setPeriodTimeInMs(periodTimeInMs); - } + long periodTimeInMs = stmt.getPeriodTimeInMs(); + infoBuilder.setPeriodTimeInMs(periodTimeInMs); Map> colToPartitions = validateAndGetPartitions(table, columnNames, - partitionNames, analysisType, analysisMode); - taskInfoBuilder.setColToPartitions(colToPartitions); - taskInfoBuilder.setTaskIds(Lists.newArrayList()); - - return taskInfoBuilder.build(); - } - - private AnalysisInfo buildAnalysisJobInfo(AnalysisInfo jobInfo) { - AnalysisInfoBuilder taskInfoBuilder = new AnalysisInfoBuilder(); - taskInfoBuilder.setJobId(jobInfo.jobId); - taskInfoBuilder.setCatalogName(jobInfo.catalogName); - taskInfoBuilder.setDbName(jobInfo.dbName); - taskInfoBuilder.setTblName(jobInfo.tblName); - taskInfoBuilder.setJobType(JobType.SYSTEM); - taskInfoBuilder.setState(AnalysisState.PENDING); - taskInfoBuilder.setLastExecTimeInMs(System.currentTimeMillis()); - taskInfoBuilder.setAnalysisType(jobInfo.analysisType); - taskInfoBuilder.setAnalysisMode(jobInfo.analysisMode); - taskInfoBuilder.setAnalysisMethod(jobInfo.analysisMethod); - taskInfoBuilder.setScheduleType(jobInfo.scheduleType); - taskInfoBuilder.setSamplePercent(jobInfo.samplePercent); - taskInfoBuilder.setSampleRows(jobInfo.sampleRows); - taskInfoBuilder.setMaxBucketNum(jobInfo.maxBucketNum); - taskInfoBuilder.setPeriodTimeInMs(jobInfo.periodTimeInMs); - taskInfoBuilder.setLastExecTimeInMs(jobInfo.lastExecTimeInMs); - try { - TableIf table = StatisticsUtil - .findTable(jobInfo.catalogName, jobInfo.dbName, 
jobInfo.tblName); - Map> colToPartitions = validateAndGetPartitions(table, jobInfo.colToPartitions.keySet(), - jobInfo.partitionNames, jobInfo.analysisType, jobInfo.analysisMode); - taskInfoBuilder.setColToPartitions(colToPartitions); - } catch (Throwable e) { - throw new RuntimeException(e); - } - return taskInfoBuilder.build(); + partitionNames, analysisType); + infoBuilder.setColToPartitions(colToPartitions); + infoBuilder.setTaskIds(Lists.newArrayList()); + + return infoBuilder.build(); } - private void persistAnalysisJob(AnalysisInfo jobInfo) throws DdlException { + @VisibleForTesting + public void recordAnalysisJob(AnalysisInfo jobInfo) throws DdlException { if (jobInfo.scheduleType == ScheduleType.PERIOD && jobInfo.lastExecTimeInMs > 0) { return; } AnalysisInfoBuilder jobInfoBuilder = new AnalysisInfoBuilder(jobInfo); AnalysisInfo analysisInfo = jobInfoBuilder.setTaskId(-1).build(); - logCreateAnalysisJob(analysisInfo); - } - - private void createTaskForMVIdx(AnalysisInfo jobInfo, Map analysisTasks, - boolean isSync) throws DdlException { - TableIf table; - try { - table = StatisticsUtil.findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); - } catch (Throwable e) { - LOG.warn(e.getMessage()); - return; - } - - TableType type = table.getType(); - if (jobInfo.analysisType != AnalysisType.INDEX || !type.equals(TableType.OLAP)) { - // not need to collect statistics for materialized view - return; - } - - OlapTable olapTable = (OlapTable) table; - - try { - olapTable.readLock(); - for (MaterializedIndexMeta meta : olapTable.getIndexIdToMeta().values()) { - if (meta.getDefineStmt() == null) { - continue; - } - long indexId = meta.getIndexId(); - long taskId = Env.getCurrentEnv().getNextId(); - AnalysisInfoBuilder indexTaskInfoBuilder = new AnalysisInfoBuilder(jobInfo); - AnalysisInfo analysisInfo = indexTaskInfoBuilder.setIndexId(indexId) - .setTaskId(taskId).setLastExecTimeInMs(System.currentTimeMillis()).build(); - jobInfo.addTaskId(taskId); - if 
(isSync) { - return; - } - analysisTasks.put(taskId, createTask(analysisInfo)); - logCreateAnalysisTask(analysisInfo); - } - } finally { - olapTable.readUnlock(); - } + replayCreateAnalysisJob(analysisInfo); } - private void createTaskForEachColumns(AnalysisInfo jobInfo, Map analysisTasks, + public void createTaskForEachColumns(AnalysisInfo jobInfo, Map analysisTasks, boolean isSync) throws DdlException { Map> columnToPartitions = jobInfo.colToPartitions; for (Entry> entry : columnToPartitions.entrySet()) { @@ -549,7 +586,7 @@ private void createTaskForEachColumns(AnalysisInfo jobInfo, Map analysisTasks, boolean isSync) throws DdlException { - TableIf table; - try { - table = StatisticsUtil.findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); - } catch (Throwable e) { - LOG.warn(e.getMessage()); - return; - } - if (jobInfo.analysisType == AnalysisType.HISTOGRAM || !(table instanceof ExternalTable)) { + + if (jobInfo.analysisType == AnalysisType.HISTOGRAM) { return; } AnalysisInfoBuilder colTaskInfoBuilder = new AnalysisInfoBuilder(jobInfo); @@ -593,120 +625,49 @@ private void createTaskForExternalTable(AnalysisInfo jobInfo, return; } try { - logCreateAnalysisTask(analysisInfo); + replayCreateAnalysisTask(analysisInfo); } catch (Exception e) { throw new DdlException("Failed to create analysis task", e); } } public void updateTaskStatus(AnalysisInfo info, AnalysisState taskState, String message, long time) { - if (analysisJobIdToTaskMap.get(info.jobId) == null) { - return; - } - info.state = taskState; - info.message = message; - // Update the task cost time when task finished or failed. And only log the final state. 
- if (taskState.equals(AnalysisState.FINISHED) || taskState.equals(AnalysisState.FAILED)) { - info.timeCostInMs = time - info.lastExecTimeInMs; - info.lastExecTimeInMs = time; - logCreateAnalysisTask(info); - } - info.lastExecTimeInMs = time; - AnalysisInfo job = analysisJobInfoMap.get(info.jobId); - // Synchronize the job state change in job level. - synchronized (job) { - job.lastExecTimeInMs = time; - // Set the job state to RUNNING when its first task becomes RUNNING. - if (info.state.equals(AnalysisState.RUNNING) && job.state.equals(AnalysisState.PENDING)) { - job.state = AnalysisState.RUNNING; - replayCreateAnalysisJob(job); - } - boolean allFinished = true; - boolean hasFailure = false; - for (BaseAnalysisTask task : analysisJobIdToTaskMap.get(info.jobId).values()) { - AnalysisInfo taskInfo = task.info; - if (taskInfo.state.equals(AnalysisState.RUNNING) || taskInfo.state.equals(AnalysisState.PENDING)) { - allFinished = false; - break; - } - if (taskInfo.state.equals(AnalysisState.FAILED)) { - hasFailure = true; - } - } - if (allFinished) { - if (hasFailure) { - job.state = AnalysisState.FAILED; - logCreateAnalysisJob(job); - } else { - job.state = AnalysisState.FINISHED; - if (job.jobType.equals(JobType.SYSTEM)) { - try { - updateTableStats(job); - } catch (Throwable e) { - LOG.warn("Failed to update Table statistics in job: {}", info.toString(), e); - } - } - logCreateAnalysisJob(job); - } - analysisJobIdToTaskMap.remove(job.jobId); - } - } + TaskStatusWrapper taskStatusWrapper = new TaskStatusWrapper(info, taskState, message, time); + updaters[info.jobType.ordinal()].apply(taskStatusWrapper); } - private void updateTableStats(AnalysisInfo jobInfo) throws Throwable { - Map params = buildTableStatsParams(jobInfo); + @VisibleForTesting + public void updateTableStats(AnalysisInfo jobInfo) { TableIf tbl = StatisticsUtil.findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); - - // update olap table stats - if (tbl.getType() == TableType.OLAP) { - 
OlapTable table = (OlapTable) tbl; - updateOlapTableStats(table, params); + // External Table update table stats after table level task finished. + if (tbl instanceof ExternalTable) { + return; + } + TableStatsMeta tableStats = findTableStatsStatus(tbl.getId()); + if (tableStats == null) { + updateTableStatsStatus(new TableStatsMeta(tbl.getId(), tbl.estimatedRowCount(), jobInfo)); + } else { + tableStats.updateByJob(jobInfo); + logCreateTableStats(tableStats); } - // External Table doesn't collect table stats here. - // We create task for external table to collect table/partition level statistics. - } - - @SuppressWarnings("rawtypes") - private Map buildTableStatsParams(AnalysisInfo jobInfo) throws Throwable { - CatalogIf catalog = StatisticsUtil.findCatalog(jobInfo.catalogName); - DatabaseIf db = StatisticsUtil.findDatabase(jobInfo.catalogName, jobInfo.dbName); - TableIf tbl = StatisticsUtil.findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); - String indexId = String.valueOf(jobInfo.indexId); - String id = StatisticsUtil.constructId(tbl.getId(), indexId); - Map commonParams = new HashMap<>(); - commonParams.put("id", id); - commonParams.put("catalogId", String.valueOf(catalog.getId())); - commonParams.put("dbId", String.valueOf(db.getId())); - commonParams.put("tblId", String.valueOf(tbl.getId())); - commonParams.put("indexId", indexId); - commonParams.put("lastAnalyzeTimeInMs", String.valueOf(System.currentTimeMillis())); - return commonParams; } - private void updateOlapTableStats(OlapTable table, Map params) throws Throwable { - for (Partition partition : table.getPartitions()) { - HashMap partParams = Maps.newHashMap(params); - long rowCount = partition.getBaseIndex().getRowCount(); - partParams.put("id", StatisticsUtil - .constructId(params.get("id"), partition.getId())); - partParams.put("partId", String.valueOf(partition.getId())); - partParams.put("rowCount", String.valueOf(rowCount)); - StatisticsRepository.persistTableStats(partParams); + 
public List showAnalysisJob(ShowAnalyzeStmt stmt) { + if (stmt.isAuto()) { + // It's ok to sync on this field, it would only be assigned when instance init or do checkpoint + synchronized (autoJobs) { + return findShowAnalyzeResult(autoJobs, stmt); + } } - - HashMap tblParams = Maps.newHashMap(params); - long rowCount = table.getRowCount(); - tblParams.put("partId", "NULL"); - tblParams.put("rowCount", String.valueOf(rowCount)); - StatisticsRepository.persistTableStats(tblParams); + return findShowAnalyzeResult(analysisJobInfoMap.values(), stmt); } - public List showAnalysisJob(ShowAnalyzeStmt stmt) { + protected List findShowAnalyzeResult(Collection analysisInfos, ShowAnalyzeStmt stmt) { String state = stmt.getStateValue(); TableName tblName = stmt.getDbTableName(); - return analysisJobInfoMap.values().stream() + return analysisInfos.stream() .filter(a -> stmt.getJobId() == 0 || a.jobId == stmt.getJobId()) .filter(a -> state == null || a.state.equals(AnalysisState.valueOf(state))) .filter(a -> tblName == null || a.catalogName.equals(tblName.getCtl()) @@ -737,10 +698,11 @@ public String getJobProgress(long jobId) { break; } } - return String.format("%d Finished/%d Failed/%d In Progress/%d Total", finished, failed, inProgress, total); + return String.format(progressDisplayTemplate, finished, failed, inProgress, total); } - private void syncExecute(Collection tasks) { + @VisibleForTesting + public void syncExecute(Collection tasks) { SyncTaskCollection syncTaskCollection = new SyncTaskCollection(tasks); ConnectContext ctx = ConnectContext.get(); try { @@ -754,7 +716,8 @@ private void syncExecute(Collection tasks) { private ThreadPoolExecutor createThreadPoolForSyncAnalyze() { String poolName = "SYNC ANALYZE THREAD POOL"; - return new ThreadPoolExecutor(0, 64, + return new ThreadPoolExecutor(0, + ConnectContext.get().getSessionVariable().parallelSyncAnalyzeTaskNum, 0, TimeUnit.SECONDS, new SynchronousQueue(), new 
ThreadFactoryBuilder().setDaemon(true).setNameFormat("SYNC ANALYZE" + "-%d") @@ -767,17 +730,23 @@ public void dropStats(DropStatsStmt dropStatsStmt) throws DdlException { Env.getCurrentEnv().getStatisticsCleaner().clear(); return; } + Set cols = dropStatsStmt.getColumnNames(); long tblId = dropStatsStmt.getTblId(); - StatisticsRepository.dropStatistics(tblId, cols); - for (String col : cols) { - Env.getCurrentEnv().getStatisticsCache().invalidate(tblId, -1L, col); + TableStatsMeta tableStats = findTableStatsStatus(dropStatsStmt.getTblId()); + if (tableStats == null) { + return; } - if (dropStatsStmt.dropTableRowCount()) { - StatisticsRepository.dropExternalTableStatistics(tblId); - // Table cache key doesn't care about catalog id and db id, because the table id is globally unique. - Env.getCurrentEnv().getStatisticsCache().invalidateTableStats(-1, -1, tblId); + if (cols == null) { + tableStats.reset(); + } else { + dropStatsStmt.getColumnNames().forEach(tableStats::removeColumn); + for (String col : cols) { + Env.getCurrentEnv().getStatisticsCache().invalidate(tblId, -1L, col); + } } + logCreateTableStats(tableStats); + StatisticsRepository.dropStatistics(tblId, cols); } public void handleKillAnalyzeStmt(KillAnalysisJobStmt killAnalysisJobStmt) throws DdlException { @@ -872,15 +841,15 @@ public void execute(ThreadPoolExecutor executor) { executor.submit(() -> { try { if (cancelled) { + errorMessages.add("Query timeout or user cancelled." 
+ + "Could set analyze_timeout to a bigger value."); return; } try { task.execute(); - updateSyncTaskStatus(task, AnalysisState.FINISHED); } catch (Throwable t) { colNames.add(task.info.colName); errorMessages.add(Util.getRootCauseMessage(t)); - updateSyncTaskStatus(task, AnalysisState.FAILED); LOG.warn("Failed to analyze, info: {}", task, t); } } finally { @@ -898,32 +867,22 @@ public void execute(ThreadPoolExecutor executor) { + "] Reasons: " + String.join(",", errorMessages)); } } - - private void updateSyncTaskStatus(BaseAnalysisTask task, AnalysisState state) { - Env.getCurrentEnv().getAnalysisManager() - .updateTaskStatus(task.info, state, "", System.currentTimeMillis()); - } - } - - public List findAutomaticAnalysisJobs() { - synchronized (analysisJobInfoMap) { - return analysisJobInfoMap.values().stream() - .filter(a -> - a.scheduleType.equals(ScheduleType.AUTOMATIC) - && (!(a.state.equals(AnalysisState.RUNNING) - || a.state.equals(AnalysisState.PENDING))) - && System.currentTimeMillis() - a.lastExecTimeInMs - > TimeUnit.MINUTES.toMillis(Config.auto_check_statistics_in_minutes)) - .collect(Collectors.toList()); - } } public List findPeriodicJobs() { synchronized (analysisJobInfoMap) { + Predicate p = a -> { + if (a.state.equals(AnalysisState.RUNNING)) { + return false; + } + if (a.cronExpression == null) { + return a.scheduleType.equals(ScheduleType.PERIOD) + && System.currentTimeMillis() - a.lastExecTimeInMs > a.periodTimeInMs; + } + return a.cronExpression.getTimeAfter(new Date(a.lastExecTimeInMs)).before(new Date()); + }; return analysisJobInfoMap.values().stream() - .filter(a -> a.scheduleType.equals(ScheduleType.PERIOD) - && (a.state.equals(AnalysisState.FINISHED)) - && System.currentTimeMillis() - a.lastExecTimeInMs > a.periodTimeInMs) + .filter(p) .collect(Collectors.toList()); } } @@ -937,7 +896,8 @@ public List findTasks(long jobId) { public List findTasksByTaskIds(long jobId) { AnalysisInfo jobInfo = analysisJobInfoMap.get(jobId); if (jobInfo != 
null && jobInfo.taskIds != null) { - return jobInfo.taskIds.stream().map(id -> analysisTaskInfoMap.get(id)).collect(Collectors.toList()); + return jobInfo.taskIds.stream().map(analysisTaskInfoMap::get).filter(i -> i != null) + .collect(Collectors.toList()); } return null; } @@ -963,34 +923,185 @@ public void dropAnalyzeJob(DropAnalyzeJobStmt analyzeJobStmt) throws DdlExceptio public static AnalysisManager readFields(DataInput in) throws IOException { AnalysisManager analysisManager = new AnalysisManager(); - doRead(in, analysisManager.analysisJobInfoMap, true); - doRead(in, analysisManager.analysisTaskInfoMap, false); + readAnalysisInfo(in, analysisManager.analysisJobInfoMap, true); + readAnalysisInfo(in, analysisManager.analysisTaskInfoMap, false); + readIdToTblStats(in, analysisManager.idToTblStats); + readAutoJobs(in, analysisManager); return analysisManager; } - private static void doRead(DataInput in, Map map, boolean job) throws IOException { + private static void readAnalysisInfo(DataInput in, Map map, boolean job) throws IOException { int size = in.readInt(); for (int i = 0; i < size; i++) { AnalysisInfo analysisInfo = AnalysisInfo.read(in); + // Unfinished manual once job/tasks doesn't need to keep in memory anymore. + if (needAbandon(analysisInfo)) { + continue; + } map.put(job ? analysisInfo.jobId : analysisInfo.taskId, analysisInfo); } } + // Need to abandon the unfinished manual once jobs/tasks while loading image and replay journal. + // Journal only store finished tasks and jobs. 
+ public static boolean needAbandon(AnalysisInfo analysisInfo) { + if (analysisInfo == null) { + return true; + } + if ((AnalysisState.PENDING.equals(analysisInfo.state) || AnalysisState.RUNNING.equals(analysisInfo.state)) + && ScheduleType.ONCE.equals(analysisInfo.scheduleType) + && JobType.MANUAL.equals(analysisInfo.jobType)) { + return true; + } + return false; + } + + private static void readIdToTblStats(DataInput in, Map map) throws IOException { + int size = in.readInt(); + for (int i = 0; i < size; i++) { + TableStatsMeta tableStats = TableStatsMeta.read(in); + map.put(tableStats.tblId, tableStats); + } + } + + private static void readAutoJobs(DataInput in, AnalysisManager analysisManager) throws IOException { + Type type = new TypeToken>() {}.getType(); + Collection autoJobs = GsonUtils.GSON.fromJson(Text.readString(in), type); + analysisManager.autoJobs = analysisManager.createSimpleQueue(autoJobs, analysisManager); + } + @Override public void write(DataOutput out) throws IOException { - doWrite(out, analysisJobInfoMap); - doWrite(out, analysisTaskInfoMap); + writeJobInfo(out, analysisJobInfoMap); + writeJobInfo(out, analysisTaskInfoMap); + writeTableStats(out); + writeAutoJobsStatus(out); } - private void doWrite(DataOutput out, Map infoMap) throws IOException { + private void writeJobInfo(DataOutput out, Map infoMap) throws IOException { out.writeInt(infoMap.size()); for (Entry entry : infoMap.entrySet()) { entry.getValue().write(out); } } + private void writeTableStats(DataOutput out) throws IOException { + out.writeInt(idToTblStats.size()); + for (Entry entry : idToTblStats.entrySet()) { + entry.getValue().write(out); + } + } + + private void writeAutoJobsStatus(DataOutput output) throws IOException { + Type type = new TypeToken>() {}.getType(); + String autoJobs = GsonUtils.GSON.toJson(this.autoJobs, type); + Text.writeString(output, autoJobs); + } + // For unit test use only. 
public void addToJobIdTasksMap(long jobId, Map tasks) { analysisJobIdToTaskMap.put(jobId, tasks); } + + public TableStatsMeta findTableStatsStatus(long tblId) { + return idToTblStats.get(tblId); + } + + // Invoke this when load transaction finished. + public void updateUpdatedRows(long tblId, long rows) { + TableStatsMeta statsStatus = idToTblStats.get(tblId); + if (statsStatus != null) { + statsStatus.updatedRows.addAndGet(rows); + logCreateTableStats(statsStatus); + } + } + + public void updateTableStatsStatus(TableStatsMeta tableStats) { + replayUpdateTableStatsStatus(tableStats); + logCreateTableStats(tableStats); + } + + public void replayUpdateTableStatsStatus(TableStatsMeta tableStats) { + idToTblStats.put(tableStats.tblId, tableStats); + } + + public void logCreateTableStats(TableStatsMeta tableStats) { + Env.getCurrentEnv().getEditLog().logCreateTableStats(tableStats); + } + + public void registerSysJob(AnalysisInfo jobInfo, Map taskInfos) { + jobInfo.state = AnalysisState.RUNNING; + systemJobInfoMap.put(jobInfo.jobId, jobInfo); + analysisJobIdToTaskMap.put(jobInfo.jobId, taskInfos); + } + + @VisibleForTesting + protected Set findReAnalyzeNeededPartitions(TableIf table) { + TableStatsMeta tableStats = findTableStatsStatus(table.getId()); + if (tableStats == null) { + return table.getPartitionNames().stream().map(table::getPartition) + .filter(Partition::hasData).map(Partition::getName).collect(Collectors.toSet()); + } + return table.getPartitionNames().stream() + .map(table::getPartition) + .filter(Partition::hasData) + .filter(partition -> + partition.getVisibleVersionTime() >= tableStats.updatedTime).map(Partition::getName) + .collect(Collectors.toSet()); + } + + protected void logAutoJob(AnalysisInfo autoJob) { + Env.getCurrentEnv().getEditLog().logAutoJob(autoJob); + } + + public void replayPersistSysJob(AnalysisInfo analysisInfo) { + autoJobs.offer(analysisInfo); + } + + protected SimpleQueue createSimpleQueue(Collection collection, + AnalysisManager 
analysisManager) { + return new SimpleQueue<>(Config.auto_analyze_job_record_count, + a -> { + // FE is not ready when replaying log and operations triggered by replaying + // shouldn't be logged again. + if (Env.getCurrentEnv().isReady() && Env.getCurrentEnv().isMaster() && !Env.isCheckpointThread()) { + analysisManager.logAutoJob(a); + } + return null; + }, + a -> { + // DO NOTHING + return null; + }, null); + } + + // Remove col stats status from TableStats if failed load some col stats after analyze corresponding column so that + // we could make sure it would be analyzed again soon if user or system submit job for that column again. + public void removeColStatsStatus(long tblId, String colName) { + TableStatsMeta tableStats = findTableStatsStatus(tblId); + if (tableStats != null) { + tableStats.removeColumn(colName); + } + } + + public void removeTableStats(long tblId) { + if (!idToTblStats.containsKey(tblId)) { + return; + } + TableStatsDeletionLog log = new TableStatsDeletionLog(tblId); + Env.getCurrentEnv().getEditLog().logDeleteTableStats(log); + replayTableStatsDeletion(log); + } + + public void replayTableStatsDeletion(TableStatsDeletionLog log) { + idToTblStats.remove(log.id); + } + + public ColStatsMeta findColStatsMeta(long tblId, String colName) { + TableStatsMeta tableStats = findTableStatsStatus(tblId); + if (tableStats == null) { + return null; + } + return tableStats.findColumnStatsMeta(colName); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisState.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisState.java index bab8a462e8a5bd..3abc4c224faad2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisState.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisState.java @@ -18,7 +18,9 @@ package org.apache.doris.statistics; public enum AnalysisState { + // When analyze job/task created, but never run PENDING, + // When analyze job/task is in running queue 
RUNNING, FINISHED, FAILED; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskExecutor.java index b5ec7aeb876807..4b133ce0ebfc68 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskExecutor.java @@ -17,6 +17,7 @@ package org.apache.doris.statistics; +import org.apache.doris.catalog.Env; import org.apache.doris.common.Config; import org.apache.doris.common.ThreadPoolManager; import org.apache.doris.common.ThreadPoolManager.BlockedPolicy; @@ -35,26 +36,30 @@ public class AnalysisTaskExecutor extends Thread { private static final Logger LOG = LogManager.getLogger(AnalysisTaskExecutor.class); - private final ThreadPoolExecutor executors = ThreadPoolManager.newDaemonThreadPool( - Config.statistics_simultaneously_running_task_num, - Config.statistics_simultaneously_running_task_num, 0, - TimeUnit.DAYS, new LinkedBlockingQueue<>(), - new BlockedPolicy("Analysis Job Executor", Integer.MAX_VALUE), - "Analysis Job Executor", true); - - private final AnalysisTaskScheduler taskScheduler; + private final ThreadPoolExecutor executors; private final BlockingQueue taskQueue = new PriorityBlockingQueue(20, Comparator.comparingLong(AnalysisTaskWrapper::getStartTime)); - public AnalysisTaskExecutor(AnalysisTaskScheduler jobExecutor) { - this.taskScheduler = jobExecutor; + public AnalysisTaskExecutor(int simultaneouslyRunningTaskNum) { + if (!Env.isCheckpointThread()) { + executors = ThreadPoolManager.newDaemonThreadPool( + simultaneouslyRunningTaskNum, + simultaneouslyRunningTaskNum, 0, + TimeUnit.DAYS, new LinkedBlockingQueue<>(), + new BlockedPolicy("Analysis Job Executor", Integer.MAX_VALUE), + "Analysis Job Executor", true); + } else { + executors = null; + } } @Override public void run() { - fetchAndExecute(); + if (Env.isCheckpointThread()) { + return; + } 
cancelExpiredTask(); } @@ -82,22 +87,7 @@ private void doCancelExpiredJob() { } } - public void fetchAndExecute() { - Thread t = new Thread(() -> { - for (;;) { - try { - doFetchAndExecute(); - } catch (Throwable throwable) { - LOG.warn(throwable); - } - } - }, "Analysis Task Submitter"); - t.setDaemon(true); - t.start(); - } - - private void doFetchAndExecute() { - BaseAnalysisTask task = taskScheduler.getPendingTasks(); + public void submitTask(BaseAnalysisTask task) { AnalysisTaskWrapper taskWrapper = new AnalysisTaskWrapper(this, task); executors.submit(taskWrapper); } @@ -105,4 +95,13 @@ private void doFetchAndExecute() { public void putJob(AnalysisTaskWrapper wrapper) throws Exception { taskQueue.put(wrapper); } + + public boolean idle() { + return executors.getQueue().isEmpty(); + } + + public void clear() { + executors.getQueue().clear(); + taskQueue.clear(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskScheduler.java deleted file mode 100644 index 5c9de2b58b22b9..00000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskScheduler.java +++ /dev/null @@ -1,108 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics; - -import org.apache.doris.catalog.Env; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.util.Comparator; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.PriorityQueue; -import java.util.Queue; -import java.util.Set; - -public class AnalysisTaskScheduler { - - private static final Logger LOG = LogManager.getLogger(AnalysisTaskScheduler.class); - - private final PriorityQueue systemJobQueue = - new PriorityQueue<>(Comparator.comparingLong(BaseAnalysisTask::getLastExecTime)); - - private final Queue manualJobQueue = new LinkedList<>(); - - private final Set systemJobSet = new HashSet<>(); - - private final Set manualJobSet = new HashSet<>(); - - public synchronized void schedule(BaseAnalysisTask analysisTask) { - try { - - switch (analysisTask.info.jobType) { - case MANUAL: - addToManualJobQueue(analysisTask); - break; - case SYSTEM: - addToSystemQueue(analysisTask); - break; - default: - throw new IllegalArgumentException("Unknown job type: " + analysisTask.info.jobType); - } - } catch (Throwable t) { - Env.getCurrentEnv().getAnalysisManager().updateTaskStatus( - analysisTask.info, AnalysisState.FAILED, t.getMessage(), System.currentTimeMillis()); - } - } - - // Make sure invoker of this method is synchronized on object. - - private void addToSystemQueue(BaseAnalysisTask analysisJobInfo) { - if (systemJobSet.contains(analysisJobInfo)) { - return; - } - systemJobSet.add(analysisJobInfo); - systemJobQueue.add(analysisJobInfo); - notify(); - } - - // Make sure invoker of this method is synchronized on object. 
- private void addToManualJobQueue(BaseAnalysisTask analysisJobInfo) { - if (manualJobSet.contains(analysisJobInfo)) { - return; - } - manualJobSet.add(analysisJobInfo); - manualJobQueue.add(analysisJobInfo); - notify(); - } - - public synchronized BaseAnalysisTask getPendingTasks() { - while (true) { - if (!manualJobQueue.isEmpty()) { - return pollAndRemove(manualJobQueue, manualJobSet); - } - if (!systemJobQueue.isEmpty()) { - return pollAndRemove(systemJobQueue, systemJobSet); - } - try { - wait(); - } catch (Exception e) { - LOG.warn("Thread get interrupted when waiting for pending jobs", e); - return null; - } - } - } - - // Poll from queue, remove from set. Make sure invoker of this method is synchronized on object. - private BaseAnalysisTask pollAndRemove(Queue q, Set s) { - BaseAnalysisTask t = q.poll(); - s.remove(t); - return t; - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java index 7f55469f533586..9aa3d85992b32c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java @@ -18,11 +18,15 @@ package org.apache.doris.statistics; import org.apache.doris.catalog.Env; +import org.apache.doris.common.util.TimeUtils; import org.apache.doris.common.util.Util; +import org.apache.doris.statistics.AnalysisInfo.ScheduleType; +import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import java.time.LocalTime; import java.util.concurrent.FutureTask; public class AnalysisTaskWrapper extends FutureTask { @@ -52,6 +56,14 @@ public void run() { if (task.killed) { return; } + if (task.info.scheduleType.equals(ScheduleType.AUTOMATIC) && !StatisticsUtil.inAnalyzeTime( + LocalTime.now(TimeUtils.getTimeZone().toZoneId()))) { + // TODO: Do we need a 
separate AnalysisState here? + Env.getCurrentEnv().getAnalysisManager() + .updateTaskStatus(task.info, AnalysisState.FAILED, "Auto task" + + "doesn't get executed within specified time range", System.currentTimeMillis()); + return; + } executor.putJob(this); super.run(); Object result = get(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 719df8769a43b9..af85b528247816 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -17,11 +17,16 @@ package org.apache.doris.statistics; +import org.apache.doris.analysis.TableSample; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.TableIf; +import org.apache.doris.common.Config; import org.apache.doris.datasource.CatalogIf; +import org.apache.doris.qe.AuditLogHelper; +import org.apache.doris.qe.QueryState; +import org.apache.doris.qe.QueryState.MysqlStateType; import org.apache.doris.qe.StmtExecutor; import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod; import org.apache.doris.statistics.AnalysisInfo.AnalysisType; @@ -64,7 +69,7 @@ public abstract class BaseAnalysisTask { protected static final String INSERT_COL_STATISTICS = "INSERT INTO " + "${internalDB}.${columnStatTbl}" + " SELECT id, catalog_id, db_id, tbl_id, idx_id, col_id, part_id, row_count, " - + " ndv, null_count, min, max, data_size, update_time\n" + + " ndv, null_count, CAST(min AS string), CAST(max AS string), data_size, update_time\n" + " FROM \n" + " (SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + " ${catalogId} AS catalog_id, " @@ -78,7 +83,7 @@ public abstract class BaseAnalysisTask { + " MIN(CAST(min AS ${type})) AS min, " + " MAX(CAST(max AS ${type})) AS max, " + " 
SUM(data_size_in_bytes) AS data_size, " - + " NOW() AS update_time\n" + + " NOW() AS update_time \n" + " FROM ${internalDB}.${columnStatTbl}" + " WHERE ${internalDB}.${columnStatTbl}.db_id = '${dbId}' AND " + " ${internalDB}.${columnStatTbl}.tbl_id='${tblId}' AND " @@ -101,6 +106,8 @@ public abstract class BaseAnalysisTask { protected volatile boolean killed; + protected TableSample tableSample = null; + @VisibleForTesting public BaseAnalysisTask() { @@ -111,7 +118,7 @@ public BaseAnalysisTask(AnalysisInfo info) { init(info); } - private void init(AnalysisInfo info) { + protected void init(AnalysisInfo info) { catalog = Env.getCurrentEnv().getCatalogMgr().getCatalog(info.catalogName); if (catalog == null) { Env.getCurrentEnv().getAnalysisManager().updateTaskStatus(info, AnalysisState.FAILED, @@ -130,6 +137,7 @@ private void init(AnalysisInfo info) { info, AnalysisState.FAILED, String.format("Table with name %s not exists", info.tblName), System.currentTimeMillis()); } + tableSample = getTableSample(); // External Table level task doesn't contain a column. Don't need to do the column related analyze. 
if (info.externalTableLevelTask) { return; @@ -143,7 +151,6 @@ private void init(AnalysisInfo info) { Preconditions.checkArgument(!StatisticsUtil.isUnsupportedType(col.getType()), String.format("Column with type %s is not supported", col.getType().toString())); } - } public void execute() { @@ -166,6 +173,9 @@ protected void executeWithRetry() { doExecute(); break; } catch (Throwable t) { + if (killed) { + throw new RuntimeException(t); + } LOG.warn("Failed to execute analysis task, retried times: {}", retriedTimes++, t); if (retriedTimes > StatisticConstants.ANALYZE_TASK_RETRY_TIMES) { throw new RuntimeException(t); @@ -181,12 +191,16 @@ protected void afterExecution() { if (killed) { return; } - Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName()); + long tblId = tbl.getId(); + String colName = col.getName(); + if (!Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tblId, -1, colName)) { + Env.getCurrentEnv().getAnalysisManager().removeColStatsStatus(tblId, colName); + } } protected void setTaskStateToRunning() { Env.getCurrentEnv().getAnalysisManager() - .updateTaskStatus(info, AnalysisState.RUNNING, "", System.currentTimeMillis()); + .updateTaskStatus(info, AnalysisState.RUNNING, "", System.currentTimeMillis()); } public void cancel() { @@ -199,10 +213,6 @@ public void cancel() { String.format("Job has been cancelled: %s", info.message), System.currentTimeMillis()); } - public long getLastExecTime() { - return info.lastExecTimeInMs; - } - public long getJobId() { return info.jobId; } @@ -215,22 +225,49 @@ protected String getDataSizeFunction(Column column) { return "COUNT(1) * " + column.getType().getSlotSize(); } - protected String getSampleExpression() { - if (info.analysisMethod == AnalysisMethod.FULL) { - return ""; + protected TableSample getTableSample() { + if (info.forceFull) { + return null; } - // TODO Add sampling methods for external tables + // If user specified sample percent or sample rows, use it. 
if (info.samplePercent > 0) { - return String.format("TABLESAMPLE(%d PERCENT)", info.samplePercent); + return new TableSample(true, (long) info.samplePercent); + } else if (info.sampleRows > 0) { + return new TableSample(false, info.sampleRows); + } else if (info.analysisMethod == AnalysisMethod.FULL + && Config.enable_auto_sample + && tbl.getDataSize(true) > Config.huge_table_lower_bound_size_in_bytes) { + // If user doesn't specify sample percent/rows, use auto sample and update sample rows in analysis info. + return new TableSample(false, (long) Config.huge_table_default_sample_rows); } else { - return String.format("TABLESAMPLE(%d ROWS)", info.sampleRows); + return null; } } @Override public String toString() { return String.format("Job id [%d], Task id [%d], catalog [%s], db [%s], table [%s], column [%s]", - info.jobId, info.taskId, catalog.getName(), db.getFullName(), tbl.getName(), - col == null ? "TableRowCount" : col.getName()); + info.jobId, info.taskId, catalog.getName(), db.getFullName(), tbl.getName(), + col == null ? 
"TableRowCount" : col.getName()); + } + + protected void executeWithExceptionOnFail(StmtExecutor stmtExecutor) throws Exception { + if (killed) { + return; + } + LOG.debug("execute internal sql: {}", stmtExecutor.getOriginStmt()); + try { + stmtExecutor.execute(); + QueryState queryState = stmtExecutor.getContext().getState(); + if (queryState.getStateType().equals(MysqlStateType.ERR)) { + throw new RuntimeException(String.format("Failed to analyze %s.%s.%s, error: %s sql: %s", + info.catalogName, info.dbName, info.colName, stmtExecutor.getOriginStmt().toString(), + queryState.getErrorMessage())); + } + } finally { + AuditLogHelper.logAuditLog(stmtExecutor.getContext(), stmtExecutor.getOriginStmt().toString(), + stmtExecutor.getParsedStmt(), stmtExecutor.getQueryStatisticsForAuditLog(), + true); + } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java new file mode 100644 index 00000000000000..7addfb0c539c1d --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +import org.apache.doris.statistics.util.StatisticsUtil; + +import java.util.StringJoiner; + +/** + * Used to convert data from ResultRow. + * 0: id + * 1: catalog_id + * 2: db_id + * 3: tbl_id + * 4: idx_id + * 5: col_id + * 6: part_id + * 7: count + * 8: ndv + * 9: null_count + * 10: min + * 11: max + * 12: data_size_in_bytes + * 13: update_time + */ +public class ColStatsData { + public final StatsId statsId; + public final long count; + public final long ndv; + + public final long nullCount; + + public final String minLit; + public final String maxLit; + + public final long dataSizeInBytes; + + public final String updateTime; + + public ColStatsData(ResultRow row) { + this.statsId = new StatsId(row); + this.count = (long) Double.parseDouble(row.get(7)); + this.ndv = (long) Double.parseDouble(row.getWithDefault(8, "0")); + this.nullCount = (long) Double.parseDouble(row.getWithDefault(9, "0")); + this.minLit = row.get(10); + this.maxLit = row.get(11); + this.dataSizeInBytes = (long) Double.parseDouble(row.getWithDefault(12, "0")); + this.updateTime = row.get(13); + } + + public String toSQL(boolean roundByParentheses) { + StringJoiner sj = null; + if (roundByParentheses) { + sj = new StringJoiner(",", "(" + statsId.toSQL() + ",", ")"); + } else { + sj = new StringJoiner(",", statsId.toSQL(), ""); + } + sj.add(String.valueOf(count)); + sj.add(String.valueOf(ndv)); + sj.add(String.valueOf(nullCount)); + sj.add(StatisticsUtil.quote(minLit)); + sj.add(StatisticsUtil.quote(maxLit)); + sj.add(String.valueOf(dataSizeInBytes)); + sj.add(StatisticsUtil.quote(updateTime)); + return sj.toString(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsMeta.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsMeta.java new file mode 100644 index 00000000000000..445641b2505610 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsMeta.java @@ -0,0 +1,58 @@ +// 
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.statistics;

import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod;
import org.apache.doris.statistics.AnalysisInfo.AnalysisType;
import org.apache.doris.statistics.AnalysisInfo.JobType;

import com.google.gson.annotations.SerializedName;

import java.util.concurrent.atomic.AtomicLong;

/**
 * Gson-serialized bookkeeping for the statistics of one column: when it was
 * last analyzed, by which method/type of analysis, which kind of job triggered
 * it, and how many times the column has been queried.
 */
public class ColStatsMeta {

    // Timestamp of the last analyze; clear() resets it to 0 to mark the
    // recorded stats as stale.
    @SerializedName("updateTime")
    public long updatedTime;

    @SerializedName("method")
    public AnalysisMethod analysisMethod;

    @SerializedName("type")
    public AnalysisType analysisType;

    // Mutable counter so concurrent readers can bump it without locking.
    @SerializedName("queriedTimes")
    public final AtomicLong queriedTimes = new AtomicLong();

    // TODO: For column that manually analyzed, we should use same analyze method as user specified.
    @SerializedName("trigger")
    public JobType jobType;

    /**
     * @param updatedTime  timestamp of the analyze this meta records
     * @param analysisMethod how the stats were collected
     * @param analysisType   what kind of stats were collected
     * @param jobType        the job kind that triggered the analyze
     * @param queriedTimes   initial value folded into the query counter
     */
    public ColStatsMeta(long updatedTime, AnalysisMethod analysisMethod,
            AnalysisType analysisType, JobType jobType, long queriedTimes) {
        this.updatedTime = updatedTime;
        this.analysisMethod = analysisMethod;
        this.analysisType = analysisType;
        this.jobType = jobType;
        this.queriedTimes.addAndGet(queriedTimes);
    }

    /** Invalidates this meta by zeroing the last-analyzed timestamp only. */
    public void clear() {
        updatedTime = 0;
    }
}
.setNumNulls(1).setCount(1).setMaxValue(Double.POSITIVE_INFINITY).setMinValue(Double.NEGATIVE_INFINITY) - .setSelectivity(1.0).setIsUnknown(true) + .setIsUnknown(true).setUpdatedTime("") .build(); public static ColumnStatistic ZERO = new ColumnStatisticBuilder().setAvgSizeByte(0).setNdv(0) .setNumNulls(0).setCount(0).setMaxValue(Double.NaN).setMinValue(Double.NaN) - .setSelectivity(0) .build(); - public static final Set UNSUPPORTED_TYPE = new HashSet<>(); - - static { - UNSUPPORTED_TYPE.add(Type.HLL); - UNSUPPORTED_TYPE.add(Type.BITMAP); - UNSUPPORTED_TYPE.add(Type.ARRAY); - UNSUPPORTED_TYPE.add(Type.STRUCT); - UNSUPPORTED_TYPE.add(Type.MAP); - UNSUPPORTED_TYPE.add(Type.QUANTILE_STATE); - UNSUPPORTED_TYPE.add(Type.AGG_STATE); - UNSUPPORTED_TYPE.add(Type.JSONB); - UNSUPPORTED_TYPE.add(Type.VARIANT); - UNSUPPORTED_TYPE.add(Type.TIME); - UNSUPPORTED_TYPE.add(Type.TIMEV2); - UNSUPPORTED_TYPE.add(Type.LAMBDA_FUNCTION); - } + public static final Set UNSUPPORTED_TYPE = Sets.newHashSet( + Type.HLL, Type.BITMAP, Type.ARRAY, Type.STRUCT, Type.MAP, Type.QUANTILE_STATE, Type.AGG_STATE, Type.JSONB, + Type.VARIANT, Type.TIME, Type.TIMEV2, Type.LAMBDA_FUNCTION + ); @SerializedName("count") public final double count; @@ -90,19 +77,6 @@ public class ColumnStatistic { @SerializedName("maxValue") public final double maxValue; public final boolean isUnKnown; - /* - selectivity of Column T1.A: - if T1.A = T2.B is the inner join condition, for a given `b` in B, b in - intersection of range(A) and range(B), selectivity means the probability that - the equation can be satisfied. - We take tpch as example. - l_orderkey = o_orderkey and o_orderstatus='o' - there are 3 distinct o_orderstatus in orders table. filter o_orderstatus='o' reduces orders table by 1/3 - because o_orderkey is primary key, thus the o_orderkey.selectivity = 1/3, - and after join(l_orderkey = o_orderkey), lineitem is reduced by 1/3. 
- But after filter, other columns' selectivity is still 1.0 - */ - public final double selectivity; /* originalNdv is the ndv in stats of ScanNode. ndv may be changed after filter or join, @@ -111,7 +85,6 @@ and after join(l_orderkey = o_orderkey), lineitem is reduced by 1/3. */ public final ColumnStatistic original; - // For display only. public final LiteralExpr minExpr; public final LiteralExpr maxExpr; @@ -119,14 +92,17 @@ and after join(l_orderkey = o_orderkey), lineitem is reduced by 1/3. // assign value when do stats estimation. public final Histogram histogram; - public final Map partitionIdToColStats = new HashMap<>(); + @SerializedName("partitionIdToColStats") + public final Map partitionIdToColStats = new HashMap<>(); public final String updatedTime; + public final PartitionInfo partitionInfo; + public ColumnStatistic(double count, double ndv, ColumnStatistic original, double avgSizeByte, double numNulls, double dataSize, double minValue, double maxValue, - double selectivity, LiteralExpr minExpr, LiteralExpr maxExpr, boolean isUnKnown, Histogram histogram, - String updatedTime) { + LiteralExpr minExpr, LiteralExpr maxExpr, boolean isUnKnown, Histogram histogram, + String updatedTime, PartitionInfo partitionInfo) { this.count = count; this.ndv = ndv; this.original = original; @@ -135,31 +111,30 @@ public ColumnStatistic(double count, double ndv, ColumnStatistic original, doubl this.dataSize = dataSize; this.minValue = minValue; this.maxValue = maxValue; - this.selectivity = selectivity; this.minExpr = minExpr; this.maxExpr = maxExpr; this.isUnKnown = isUnKnown; this.histogram = histogram; this.updatedTime = updatedTime; + this.partitionInfo = partitionInfo; } public static ColumnStatistic fromResultRow(List resultRows) { - Map partitionIdToColStats = new HashMap<>(); + Map partitionIdToColStats = new HashMap<>(); ColumnStatistic columnStatistic = null; try { for (ResultRow resultRow : resultRows) { - String partId = resultRow.getColumnValue("part_id"); + 
String partId = resultRow.get(6); if (partId == null) { columnStatistic = fromResultRow(resultRow); } else { - partitionIdToColStats.put(Long.parseLong(partId), fromResultRow(resultRow)); + partitionIdToColStats.put(partId, fromResultRow(resultRow)); } } } catch (Throwable t) { LOG.debug("Failed to deserialize column stats", t); return ColumnStatistic.UNKNOWN; } - // Means last analyze failed or interrupted for some reason. if (columnStatistic == null) { return ColumnStatistic.UNKNOWN; } @@ -168,47 +143,44 @@ public static ColumnStatistic fromResultRow(List resultRows) { } // TODO: use thrift - public static ColumnStatistic fromResultRow(ResultRow resultRow) { + public static ColumnStatistic fromResultRow(ResultRow row) { try { ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(); - double count = Double.parseDouble(resultRow.getColumnValueWithDefault("count", "0")); + double count = Double.parseDouble(row.get(7)); columnStatisticBuilder.setCount(count); - double ndv = Double.parseDouble(resultRow.getColumnValueWithDefault("ndv", "0")); - if (0.99 * count < ndv && ndv < 1.01 * count) { - ndv = count; - } + double ndv = Double.parseDouble(row.getWithDefault(8, "0")); columnStatisticBuilder.setNdv(ndv); - String nullCount = resultRow.getColumnValueWithDefault("null_count", "0"); + String nullCount = row.getWithDefault(9, "0"); columnStatisticBuilder.setNumNulls(Double.parseDouble(nullCount)); columnStatisticBuilder.setDataSize(Double - .parseDouble(resultRow.getColumnValueWithDefault("data_size_in_bytes", "0"))); + .parseDouble(row.getWithDefault(12, "0"))); columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getCount() == 0 ? 
0 : columnStatisticBuilder.getDataSize() / columnStatisticBuilder.getCount()); - long catalogId = Long.parseLong(resultRow.getColumnValue("catalog_id")); - long idxId = Long.parseLong(resultRow.getColumnValue("idx_id")); - long dbID = Long.parseLong(resultRow.getColumnValue("db_id")); - long tblId = Long.parseLong(resultRow.getColumnValue("tbl_id")); - String colName = resultRow.getColumnValue("col_id"); + long catalogId = Long.parseLong(row.get(1)); + long idxId = Long.parseLong(row.get(4)); + long dbID = Long.parseLong(row.get(2)); + long tblId = Long.parseLong(row.get(3)); + String colName = row.get(5); Column col = StatisticsUtil.findColumn(catalogId, dbID, tblId, idxId, colName); if (col == null) { - LOG.warn("Failed to deserialize column statistics, ctlId: {} dbId: {}" - + "tblId: {} column: {} not exists", + LOG.debug("Failed to deserialize column statistics, ctlId: {} dbId: {}" + + "tblId: {} column: {} not exists", catalogId, dbID, tblId, colName); return ColumnStatistic.UNKNOWN; } - String min = resultRow.getColumnValue("min"); - String max = resultRow.getColumnValue("max"); + String min = row.get(10); + String max = row.get(11); if (min != null && !min.equalsIgnoreCase("NULL")) { try { columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min)); columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min)); } catch (AnalysisException e) { LOG.warn("Failed to deserialize column {} min value {}.", col, min, e); - columnStatisticBuilder.setMinValue(Double.MIN_VALUE); + columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY); } } else { - columnStatisticBuilder.setMinValue(Double.MIN_VALUE); + columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY); } if (max != null && !max.equalsIgnoreCase("NULL")) { try { @@ -216,16 +188,12 @@ public static ColumnStatistic fromResultRow(ResultRow resultRow) { columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max)); } catch 
(AnalysisException e) { LOG.warn("Failed to deserialize column {} max value {}.", col, max, e); - columnStatisticBuilder.setMaxValue(Double.MAX_VALUE); + columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY); } } else { - columnStatisticBuilder.setMaxValue(Double.MAX_VALUE); + columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY); } - columnStatisticBuilder.setSelectivity(1.0); - Histogram histogram = Env.getCurrentEnv().getStatisticsCache().getHistogram(tblId, idxId, colName) - .orElse(null); - columnStatisticBuilder.setHistogram(histogram); - columnStatisticBuilder.setUpdatedTime(resultRow.getColumnValue("update_time")); + columnStatisticBuilder.setUpdatedTime(row.get(13)); return columnStatisticBuilder.build(); } catch (Exception e) { LOG.warn("Failed to deserialize column statistics.", e); @@ -237,25 +205,12 @@ public static boolean isAlmostUnique(double ndv, double rowCount) { return rowCount * 0.9 < ndv && ndv < rowCount * 1.1; } - public ColumnStatistic copy() { - return new ColumnStatisticBuilder().setCount(count).setNdv(ndv).setAvgSizeByte(avgSizeByte) - .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(minValue) - .setMaxValue(maxValue).setMinExpr(minExpr).setMaxExpr(maxExpr) - .setSelectivity(selectivity).setIsUnknown(isUnKnown).build(); - } - public ColumnStatistic updateByLimit(long limit, double rowCount) { double ratio = 0; if (rowCount != 0) { ratio = limit / rowCount; } double newNdv = Math.ceil(Math.min(ndv, limit)); - double newSelectivity = selectivity; - if (newNdv != 0) { - newSelectivity = newSelectivity * newNdv / ndv; - } else { - newSelectivity = 0; - } return new ColumnStatisticBuilder() .setCount(Math.ceil(limit)) .setNdv(newNdv) @@ -266,7 +221,6 @@ public ColumnStatistic updateByLimit(long limit, double rowCount) { .setMaxValue(maxValue) .setMinExpr(minExpr) .setMaxExpr(maxExpr) - .setSelectivity(newSelectivity) .setIsUnknown(isUnKnown) .build(); } @@ -282,14 +236,11 @@ public ColumnStatistic 
updateBySelectivity(double selectivity, double rowCount) ColumnStatisticBuilder builder = new ColumnStatisticBuilder(this); Double rowsAfterFilter = rowCount * selectivity; if (isAlmostUnique(ndv, rowCount)) { - builder.setSelectivity(this.selectivity * selectivity); builder.setNdv(ndv * selectivity); } else { if (ndv > rowsAfterFilter) { - builder.setSelectivity(this.selectivity * rowsAfterFilter / ndv); builder.setNdv(rowsAfterFilter); } else { - builder.setSelectivity(this.selectivity); builder.setNdv(this.ndv); } } @@ -332,8 +283,8 @@ public boolean enclosed(ColumnStatistic other) { @Override public String toString() { - return isUnKnown ? "unknown" : String.format("ndv=%.4f, min=%f(%s), max=%f(%s), count=%.4f", - ndv, minValue, minExpr, maxValue, maxExpr, count); + return isUnKnown ? "unknown" : String.format("ndv=%.4f, min=%f(%s), max=%f(%s), count=%.4f, avgSizeByte=%f", + ndv, minValue, minExpr, maxValue, maxExpr, count, avgSizeByte); } public JSONObject toJson() { @@ -355,17 +306,16 @@ public JSONObject toJson() { statistic.put("MaxValueType", "Normal"); statistic.put("MaxValue", maxValue); } - statistic.put("Selectivity", selectivity); statistic.put("Count", count); statistic.put("AvgSizeByte", avgSizeByte); statistic.put("NumNulls", numNulls); statistic.put("DataSize", dataSize); - statistic.put("Selectivity", selectivity); statistic.put("MinExpr", minExpr); statistic.put("MaxExpr", maxExpr); statistic.put("IsUnKnown", isUnKnown); statistic.put("Histogram", Histogram.serializeToJson(histogram)); statistic.put("Original", original); + statistic.put("LastUpdatedTime", updatedTime); return statistic; } @@ -410,12 +360,11 @@ public static ColumnStatistic fromJson(String statJson) { stat.getDouble("DataSize"), minValue, maxValue, - stat.getDouble("Selectivity"), null, null, stat.getBoolean("IsUnKnown"), Histogram.deserializeFromJson(stat.getString("Histogram")), - stat.getString("lastUpdatedTine") + stat.getString("LastUpdatedTime"), null ); } @@ -443,12 
+392,7 @@ public boolean isUnKnown() { return isUnKnown; } - public void loadPartitionStats(long tableId, long idxId, String colName) throws DdlException { - List resultRows = StatisticsRepository.loadPartStats(tableId, idxId, colName); - for (ResultRow resultRow : resultRows) { - String partId = resultRow.getColumnValue("part_id"); - ColumnStatistic columnStatistic = ColumnStatistic.fromResultRow(resultRow); - partitionIdToColStats.put(Long.parseLong(partId), columnStatistic); - } + public void putPartStats(String partId, ColumnStatistic columnStatistic) { + this.partitionIdToColStats.put(partId, columnStatistic); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java index 6ca2cc55b7922f..fa4cf7ebc99cb4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java @@ -18,6 +18,10 @@ package org.apache.doris.statistics; import org.apache.doris.analysis.LiteralExpr; +import org.apache.doris.catalog.PartitionInfo; + +import java.util.HashMap; +import java.util.Map; public class ColumnStatisticBuilder { private double count; @@ -27,7 +31,6 @@ public class ColumnStatisticBuilder { private double dataSize; private double minValue; private double maxValue; - private double selectivity = 1.0; private LiteralExpr minExpr; private LiteralExpr maxExpr; @@ -37,11 +40,24 @@ public class ColumnStatisticBuilder { private ColumnStatistic original; + private Map partitionIdToColStats = new HashMap<>(); + private String updatedTime; + private PartitionInfo partitionInfo; + public ColumnStatisticBuilder() { } + public PartitionInfo getPartitionInfo() { + return partitionInfo; + } + + public ColumnStatisticBuilder setPartitionInfo(PartitionInfo partitionInfo) { + this.partitionInfo = partitionInfo; + return this; + } + public 
ColumnStatisticBuilder(ColumnStatistic columnStatistic) { this.count = columnStatistic.count; this.ndv = columnStatistic.ndv; @@ -50,13 +66,14 @@ public ColumnStatisticBuilder(ColumnStatistic columnStatistic) { this.dataSize = columnStatistic.dataSize; this.minValue = columnStatistic.minValue; this.maxValue = columnStatistic.maxValue; - this.selectivity = columnStatistic.selectivity; this.minExpr = columnStatistic.minExpr; this.maxExpr = columnStatistic.maxExpr; this.isUnknown = columnStatistic.isUnKnown; this.histogram = columnStatistic.histogram; this.original = columnStatistic.original; + this.partitionIdToColStats.putAll(columnStatistic.partitionIdToColStats); this.updatedTime = columnStatistic.updatedTime; + this.partitionInfo = columnStatistic.partitionInfo; } public ColumnStatisticBuilder setCount(double count) { @@ -99,11 +116,6 @@ public ColumnStatisticBuilder setMaxValue(double maxValue) { return this; } - public ColumnStatisticBuilder setSelectivity(double selectivity) { - this.selectivity = selectivity; - return this; - } - public ColumnStatisticBuilder setMinExpr(LiteralExpr minExpr) { this.minExpr = minExpr; return this; @@ -147,10 +159,6 @@ public double getMaxValue() { return maxValue; } - public double getSelectivity() { - return selectivity; - } - public LiteralExpr getMinExpr() { return minExpr; } @@ -176,18 +184,23 @@ public String getUpdatedTime() { return updatedTime; } - public void setUpdatedTime(String updatedTime) { + public ColumnStatisticBuilder setUpdatedTime(String updatedTime) { this.updatedTime = updatedTime; + return this; } public ColumnStatistic build() { dataSize = Math.max((count - numNulls + 1) * avgSizeByte, 0); - if (original == null) { + if (original == null && !isUnknown) { original = new ColumnStatistic(count, ndv, null, avgSizeByte, numNulls, - dataSize, minValue, maxValue, selectivity, minExpr, maxExpr, false, - histogram, updatedTime); + dataSize, minValue, maxValue, minExpr, maxExpr, + isUnknown, histogram, 
updatedTime, partitionInfo); + original.partitionIdToColStats.putAll(partitionIdToColStats); } - return new ColumnStatistic(count, ndv, original, avgSizeByte, numNulls, - dataSize, minValue, maxValue, selectivity, minExpr, maxExpr, isUnknown, histogram, updatedTime); + ColumnStatistic colStats = new ColumnStatistic(count, ndv, original, avgSizeByte, numNulls, + dataSize, minValue, maxValue, minExpr, maxExpr, + isUnknown, histogram, updatedTime, partitionInfo); + colStats.partitionIdToColStats.putAll(partitionIdToColStats); + return colStats; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java index d94a90b75f0c55..281a0e8250206f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java @@ -19,7 +19,8 @@ import org.apache.doris.catalog.Env; import org.apache.doris.catalog.TableIf; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; +import org.apache.doris.common.ThreadPoolManager; +import org.apache.doris.qe.InternalQueryExecutionException; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.logging.log4j.LogManager; @@ -27,16 +28,23 @@ import java.util.List; import java.util.Optional; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.ThreadPoolExecutor.DiscardOldestPolicy; public class ColumnStatisticsCacheLoader extends StatisticsCacheLoader> { private static final Logger LOG = LogManager.getLogger(ColumnStatisticsCacheLoader.class); + private static final ThreadPoolExecutor singleThreadPool = ThreadPoolManager.newDaemonFixedThreadPool( + StatisticConstants.RETRY_LOAD_THREAD_POOL_SIZE, + StatisticConstants.RETRY_LOAD_QUEUE_SIZE, "STATS_RELOAD", + true, + new DiscardOldestPolicy()); + @Override protected Optional 
doLoad(StatisticsCacheKey key) { // Load from statistics table. - Optional columnStatistic = loadFromStatsTable(key.tableId, - key.idxId, key.colName); + Optional columnStatistic = loadFromStatsTable(key); if (columnStatistic.isPresent()) { return columnStatistic; } @@ -52,8 +60,14 @@ protected Optional doLoad(StatisticsCacheKey key) { return columnStatistic; } - private Optional loadFromStatsTable(long tableId, long idxId, String colName) { - List columnResults = StatisticsRepository.loadColStats(tableId, idxId, colName); + private Optional loadFromStatsTable(StatisticsCacheKey key) { + List columnResults = null; + try { + columnResults = StatisticsRepository.loadColStats(key.tableId, key.idxId, key.colName); + } catch (InternalQueryExecutionException e) { + retryLoad(key); + return Optional.empty(); + } ColumnStatistic columnStatistics; try { columnStatistics = StatisticsUtil.deserializeToColumnStatistics(columnResults); @@ -67,4 +81,42 @@ private Optional loadFromStatsTable(long tableId, long idxId, S return Optional.of(columnStatistics); } } + + private void retryLoad(StatisticsCacheKey key) { + singleThreadPool.submit(new RetryTask(key, 1)); + } + + private static class RetryTask implements Runnable { + StatisticsCacheKey key; + int retryTimes; + + public RetryTask(StatisticsCacheKey key, int retryTimes) { + this.key = key; + this.retryTimes = retryTimes; + } + + @Override + public void run() { + List columnResults = null; + try { + columnResults = StatisticsRepository.loadColStats(key.tableId, key.idxId, key.colName); + } catch (InternalQueryExecutionException e) { + if (this.retryTimes < StatisticConstants.LOAD_RETRY_TIMES) { + retryTimes++; + singleThreadPool.submit(this); + } + return; + } + ColumnStatistic columnStatistics; + try { + columnStatistics = StatisticsUtil.deserializeToColumnStatistics(columnResults); + } catch (Exception e) { + LOG.warn("Exception to deserialize column statistics", e); + return; + } + if (columnStatistics != null) { + 
Env.getCurrentEnv().getStatisticsCache().putCache(key, columnStatistics); + } + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index d569cd79bd4aa0..836e6e6e493728 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -17,29 +17,28 @@ package org.apache.doris.statistics; +import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.common.FeConstants; -import org.apache.doris.common.util.TimeUtils; import org.apache.doris.qe.AutoCloseConnectContext; import org.apache.doris.qe.QueryState; import org.apache.doris.qe.StmtExecutor; -import org.apache.doris.statistics.util.InternalQueryResult; import org.apache.doris.statistics.util.StatisticsUtil; -import org.apache.commons.lang3.StringUtils; +import com.google.common.collect.Lists; import org.apache.commons.text.StringSubstitutor; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneId; import java.util.ArrayList; +import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.StringJoiner; +import java.util.stream.Collectors; public class HMSAnalysisTask extends BaseAnalysisTask { private static final Logger LOG = LogManager.getLogger(HMSAnalysisTask.class); @@ -49,7 +48,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { public static final String NUM_FILES = "numFiles"; public static final String TIMESTAMP = "transient_lastDdlTime"; - private static final String ANALYZE_SQL_TABLE_TEMPLATE = "INSERT INTO " + private static final String ANALYZE_TABLE_TEMPLATE = "INSERT INTO " + 
"${internalDB}.${columnStatTbl}" + " SELECT " + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " @@ -59,19 +58,17 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "${idxId} AS idx_id, " + "'${colId}' AS col_id, " + "NULL AS part_id, " - + "COUNT(1) AS row_count, " + + "${countExpr} AS row_count, " + "NDV(`${colName}`) AS ndv, " - + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " + + "${nullCountExpr} AS null_count, " + "MIN(`${colName}`) AS min, " + "MAX(`${colName}`) AS max, " + "${dataSizeFunction} AS data_size, " + "NOW() " - + "FROM `${catalogName}`.`${dbName}`.`${tblName}`"; + + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}"; - private static final String ANALYZE_SQL_PARTITION_TEMPLATE = "INSERT INTO " - + "${internalDB}.${columnStatTbl}" - + " SELECT " - + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + private static final String ANALYZE_PARTITION_TEMPLATE = " SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}', '-', ${partId}) AS id, " + "${catalogId} AS catalog_id, " + "${dbId} AS db_id, " + "${tblId} AS tbl_id, " @@ -84,22 +81,22 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "MIN(`${colName}`) AS min, " + "MAX(`${colName}`) AS max, " + "${dataSizeFunction} AS data_size, " - + "NOW() " - + "FROM `${catalogName}`.`${dbName}`.`${tblName}`"; + + "NOW() FROM `${catalogName}`.`${dbName}`.`${tblName}` where "; - private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT COUNT(1) as rowCount " - + "FROM `${catalogName}`.`${dbName}`.`${tblName}`"; + private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ${countExpr} as rowCount " + + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}"; + + // cache stats for each partition, it would be inserted into column_statistics in a batch. 
+ private final List> buf = new ArrayList<>(); private final boolean isTableLevelTask; - private final boolean isSamplingPartition; private final boolean isPartitionOnly; - private final Set partitionNames; + private Set partitionNames; private HMSExternalTable table; public HMSAnalysisTask(AnalysisInfo info) { super(info); isTableLevelTask = info.externalTableLevelTask; - isSamplingPartition = info.samplingPartition; isPartitionOnly = info.partitionOnly; partitionNames = info.partitionNames; table = (HMSExternalTable) tbl; @@ -114,42 +111,17 @@ public void doExecute() throws Exception { } /** - * Get table row count and insert the result to __internal_schema.table_statistics + * Get table row count */ private void getTableStats() throws Exception { - // Get table level information. An example sql for table stats: - // INSERT INTO __internal_schema.table_statistics VALUES - // ('13055', 13002, 13038, 13055, -1, 'NULL', 5, 1686111064658, NOW()) - Map parameters = table.getRemoteTable().getParameters(); - if (isPartitionOnly) { - for (String partId : partitionNames) { - StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_SQL_PARTITION_TEMPLATE); - sb.append(" where "); - String[] splits = partId.split("/"); - for (int i = 0; i < splits.length; i++) { - String value = splits[i].split("=")[1]; - splits[i] = splits[i].replace(value, "\'" + value + "\'"); - } - sb.append(StringUtils.join(splits, " and ")); - Map params = buildTableStatsParams(partId); - setParameterData(parameters, params); - List columnResult = - StatisticsUtil.execStatisticQuery(new StringSubstitutor(params) - .replace(sb.toString())); - String rowCount = columnResult.get(0).getColumnValue("rowCount"); - params.put("rowCount", rowCount); - StatisticsRepository.persistTableStats(params); - } - } else { - Map params = buildTableStatsParams(null); - List columnResult = - StatisticsUtil.execStatisticQuery(new StringSubstitutor(params) - .replace(ANALYZE_TABLE_COUNT_TEMPLATE)); - String rowCount = 
columnResult.get(0).getColumnValue("rowCount"); - params.put("rowCount", rowCount); - StatisticsRepository.persistTableStats(params); - } + Map params = buildTableStatsParams(null); + List columnResult = + StatisticsUtil.execStatisticQuery(new StringSubstitutor(params) + .replace(ANALYZE_TABLE_COUNT_TEMPLATE)); + String rowCount = columnResult.get(0).get(0); + Env.getCurrentEnv().getAnalysisManager() + .updateTableStatsStatus( + new TableStatsMeta(table.getId(), Long.parseLong(rowCount), info)); } /** @@ -173,67 +145,102 @@ private void getTableColumnStats() throws Exception { // 0 AS data_size, // NOW() FROM `hive`.`tpch100`.`region` if (isPartitionOnly) { - for (String partId : partitionNames) { - StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_SQL_TABLE_TEMPLATE); - sb.append(" where "); - String[] splits = partId.split("/"); - for (int i = 0; i < splits.length; i++) { - String value = splits[i].split("=")[1]; - splits[i] = splits[i].replace(value, "\'" + value + "\'"); - } - sb.append(StringUtils.join(splits, " and ")); - Map params = buildTableStatsParams(partId); - params.put("internalDB", FeConstants.INTERNAL_DB_NAME); - params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); - params.put("colName", col.getName()); - params.put("colId", info.colName); - params.put("dataSizeFunction", getDataSizeFunction(col)); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - String sql = stringSubstitutor.replace(sb.toString()); - executeInsertSql(sql); + getPartitionNames(); + List partitionAnalysisSQLs = new ArrayList<>(); + for (String partId : this.partitionNames) { + partitionAnalysisSQLs.add(generateSqlForPartition(partId)); } + execSQLs(partitionAnalysisSQLs); } else { StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_SQL_TABLE_TEMPLATE); - if (isSamplingPartition) { - sb.append(" where 1=1 "); - String[] splitExample = partitionNames.stream().findFirst().get().split("/"); - int parts = splitExample.length; - 
List partNames = new ArrayList<>(); - for (String split : splitExample) { - partNames.add(split.split("=")[0]); - } - List> valueLists = new ArrayList<>(); - for (int i = 0; i < parts; i++) { - valueLists.add(new ArrayList<>()); - } - for (String partId : partitionNames) { - String[] partIds = partId.split("/"); - for (int i = 0; i < partIds.length; i++) { - valueLists.get(i).add("\'" + partIds[i].split("=")[1] + "\'"); - } - } - for (int i = 0; i < parts; i++) { - sb.append(" and "); - sb.append(partNames.get(i)); - sb.append(" in ("); - sb.append(StringUtils.join(valueLists.get(i), ",")); - sb.append(") "); - } - } + sb.append(ANALYZE_TABLE_TEMPLATE); Map params = buildTableStatsParams("NULL"); params.put("internalDB", FeConstants.INTERNAL_DB_NAME); params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); params.put("colName", col.getName()); params.put("colId", info.colName); params.put("dataSizeFunction", getDataSizeFunction(col)); + params.put("nullCountExpr", getNullCountExpression()); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(sb.toString()); executeInsertSql(sql); } } + private void getPartitionNames() { + if (partitionNames == null) { + if (info.isAllPartition) { + partitionNames = table.getPartitionNames(); + } else if (info.partitionCount > 0) { + partitionNames = table.getPartitionNames().stream() + .limit(info.partitionCount).collect(Collectors.toSet()); + } + if (partitionNames == null || partitionNames.isEmpty()) { + throw new RuntimeException("Not a partition table or no partition specified."); + } + } + } + + private String generateSqlForPartition(String partId) { + StringBuilder sb = new StringBuilder(); + sb.append(ANALYZE_PARTITION_TEMPLATE); + String[] splits = partId.split("/"); + for (int i = 0; i < splits.length; i++) { + String[] kv = splits[i].split("="); + sb.append(kv[0]); + sb.append("='"); + sb.append(kv[1]); + sb.append("'"); + if (i < splits.length - 1) { + 
sb.append(" and "); + } + } + Map params = buildTableStatsParams(partId); + params.put("internalDB", FeConstants.INTERNAL_DB_NAME); + params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); + params.put("colName", col.getName()); + params.put("colId", info.colName); + params.put("dataSizeFunction", getDataSizeFunction(col)); + return new StringSubstitutor(params).replace(sb.toString()); + } + + public void execSQLs(List partitionAnalysisSQLs) throws Exception { + long startTime = System.currentTimeMillis(); + LOG.debug("analyze task {} start at {}", info.toString(), new Date()); + try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) { + List> sqlGroups = Lists.partition(partitionAnalysisSQLs, StatisticConstants.UNION_ALL_LIMIT); + for (List group : sqlGroups) { + if (killed) { + return; + } + StringJoiner partitionCollectSQL = new StringJoiner("UNION ALL"); + group.forEach(partitionCollectSQL::add); + stmtExecutor = new StmtExecutor(r.connectContext, partitionCollectSQL.toString()); + buf.add(stmtExecutor.executeInternalQuery() + .stream().map(ColStatsData::new).collect(Collectors.toList())); + QueryState queryState = r.connectContext.getState(); + if (queryState.getStateType().equals(QueryState.MysqlStateType.ERR)) { + throw new RuntimeException(String.format("Failed to analyze %s.%s.%s, error: %s sql: %s", + info.catalogName, info.dbName, info.colName, partitionCollectSQL, + queryState.getErrorMessage())); + } + } + for (List colStatsDataList : buf) { + StringBuilder batchInsertSQL = + new StringBuilder("INSERT INTO " + StatisticConstants.FULL_QUALIFIED_STATS_TBL_NAME + + " VALUES "); + StringJoiner sj = new StringJoiner(","); + colStatsDataList.forEach(c -> sj.add(c.toSQL(true))); + batchInsertSQL.append(sj); + stmtExecutor = new StmtExecutor(r.connectContext, batchInsertSQL.toString()); + executeWithExceptionOnFail(stmtExecutor); + } + } finally { + LOG.debug("analyze task {} end. 
cost {}ms", info, System.currentTimeMillis() - startTime); + } + + } + private void executeInsertSql(String sql) throws Exception { long startTime = System.currentTimeMillis(); try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) { @@ -270,6 +277,7 @@ private Map buildTableStatsParams(String partId) { commonParams.put("catalogName", catalog.getName()); commonParams.put("dbName", db.getFullName()); commonParams.put("tblName", tbl.getName()); + commonParams.put("countExpr", getCountExpression()); if (col != null) { commonParams.put("type", col.getType().toString()); } @@ -277,28 +285,39 @@ private Map buildTableStatsParams(String partId) { return commonParams; } - private void setParameterData(Map parameters, Map params) { - String numRows = ""; - String timestamp = ""; - if (parameters.containsKey(NUM_ROWS)) { - numRows = parameters.get(NUM_ROWS); + protected String getCountExpression() { + if (info.samplePercent > 0) { + return String.format("ROUND(COUNT(1) * 100 / %d)", info.samplePercent); + } else { + return "COUNT(1)"; } - if (parameters.containsKey(TIMESTAMP)) { - timestamp = parameters.get(TIMESTAMP); + } + + protected String getNullCountExpression() { + if (info.samplePercent > 0) { + return String.format("ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 100 / %d)", + info.samplePercent); + } else { + return "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END)"; + } + } + + protected String getDataSizeFunction(Column column) { + String originFunction = super.getDataSizeFunction(column); + if (info.samplePercent > 0 && !isPartitionOnly) { + return String.format("ROUND((%s) * 100 / %d)", originFunction, info.samplePercent); + } else { + return originFunction; } - params.put("numRows", numRows); - params.put("rowCount", numRows); - params.put("update_time", TimeUtils.DATETIME_FORMAT.format( - LocalDateTime.ofInstant(Instant.ofEpochMilli(Long.parseLong(timestamp) * 1000), - ZoneId.systemDefault()))); } @Override protected void 
afterExecution() { - if (isTableLevelTask) { - Env.getCurrentEnv().getStatisticsCache().refreshTableStatsSync(catalog.getId(), db.getId(), tbl.getId()); - } else { - Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName()); + // Table level task doesn't need to sync any value to sync stats, it stores the value in metadata. + // Partition only task doesn't need to refresh cached. + if (isTableLevelTask || isPartitionOnly) { + return; } + Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName()); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HistData.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HistData.java new file mode 100644 index 00000000000000..85f2fe45bd5761 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HistData.java @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +public class HistData { + + public final StatsId statsId; + + public final double sampleRate; + + public final String buckets; + + public final String updateTime; + + public HistData(ResultRow row) { + this.statsId = new StatsId(row); + this.sampleRate = Double.parseDouble(row.get(7)); + this.buckets = row.get(8); + this.updateTime = row.get(9); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/Histogram.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/Histogram.java index 05e2c199ed9083..2068c368c40f52 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Histogram.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Histogram.java @@ -20,7 +20,6 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.catalog.Type; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import com.google.common.base.Strings; @@ -62,13 +61,12 @@ public Histogram(Type dataType, double sampleRate, int numBuckets, List public static Histogram fromResultRow(ResultRow resultRow) { try { HistogramBuilder histogramBuilder = new HistogramBuilder(); - - long catalogId = Long.parseLong(resultRow.getColumnValue("catalog_id")); - long idxId = Long.parseLong(resultRow.getColumnValue("idx_id")); - long dbId = Long.parseLong(resultRow.getColumnValue("db_id")); - long tblId = Long.parseLong(resultRow.getColumnValue("tbl_id")); - - String colName = resultRow.getColumnValue("col_id"); + HistData histData = new HistData(resultRow); + long catalogId = histData.statsId.catalogId; + long idxId = histData.statsId.idxId; + long dbId = histData.statsId.dbId; + long tblId = histData.statsId.tblId; + String colName = histData.statsId.colId; Column col = StatisticsUtil.findColumn(catalogId, dbId, tblId, idxId, colName); if (col == null) { LOG.warn("Failed to deserialize histogram 
statistics, ctlId: {} dbId: {}" @@ -79,10 +77,10 @@ public static Histogram fromResultRow(ResultRow resultRow) { Type dataType = col.getType(); histogramBuilder.setDataType(dataType); - double sampleRate = Double.parseDouble(resultRow.getColumnValue("sample_rate")); + double sampleRate = histData.sampleRate; histogramBuilder.setSampleRate(sampleRate); - String json = resultRow.getColumnValue("buckets"); + String json = histData.buckets; JsonObject jsonObj = JsonParser.parseString(json).getAsJsonObject(); int bucketNum = jsonObj.get("num_buckets").getAsInt(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HistogramCacheLoader.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HistogramCacheLoader.java index 0e0752409231d6..d9928f2a639261 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HistogramCacheLoader.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HistogramCacheLoader.java @@ -18,7 +18,6 @@ package org.apache.doris.statistics; import org.apache.doris.common.FeConstants; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.commons.collections.CollectionUtils; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HistogramTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HistogramTask.java index 6b93486c056c23..11274ec79e6808 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HistogramTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HistogramTask.java @@ -46,7 +46,7 @@ public class HistogramTask extends BaseAnalysisTask { + " HISTOGRAM(`${colName}`, ${maxBucketNum}) AS buckets, " + " NOW() AS create_time " + "FROM " - + " `${dbName}`.`${tblName}` ${sampleExpr}"; + + " `${dbName}`.`${tblName}`"; @VisibleForTesting public HistogramTask() { @@ -71,7 +71,6 @@ public void doExecute() throws Exception { params.put("tblName", String.valueOf(info.tblName)); 
params.put("colName", String.valueOf(info.colName)); params.put("sampleRate", getSampleRateFunction()); - params.put("sampleExpr", getSampleExpression()); params.put("maxBucketNum", String.valueOf(info.maxBucketNum)); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java index 25b9db0b2c7bde..58be1510b44b10 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java @@ -23,7 +23,6 @@ import org.apache.doris.qe.AutoCloseConnectContext; import org.apache.doris.qe.QueryState; import org.apache.doris.qe.StmtExecutor; -import org.apache.doris.statistics.util.InternalQueryResult; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.commons.text.StringSubstitutor; @@ -81,11 +80,11 @@ public void doExecute() throws Exception { */ private void getTableStats() throws Exception { Map params = buildTableStatsParams(null); - List columnResult = + List columnResult = StatisticsUtil.execStatisticQuery(new StringSubstitutor(params).replace(ANALYZE_TABLE_COUNT_TEMPLATE)); - String rowCount = columnResult.get(0).getColumnValue("rowCount"); - params.put("rowCount", rowCount); - StatisticsRepository.persistTableStats(params); + String rowCount = columnResult.get(0).get(0); + Env.getCurrentEnv().getAnalysisManager() + .updateTableStatsStatus(new TableStatsMeta(table.getId(), Long.parseLong(rowCount), info)); } /** diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/MVAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/MVAnalysisTask.java index f63a70552cf66c..89d4313cf2f508 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/MVAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/MVAnalysisTask.java @@ -118,7 
+118,6 @@ public void doExecute() throws Exception { params.put("colName", colName); params.put("tblName", String.valueOf(info.tblName)); params.put("sql", sql); - params.put("sampleExpr", getSampleExpression()); StatisticsUtil.execUpdate(ANALYZE_MV_PART, params); } params.remove("partId"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisJob.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisJob.java new file mode 100644 index 00000000000000..877a4f5bd09364 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisJob.java @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +import java.util.List; + +public class OlapAnalysisJob { + + + + private List columns; + + private static String collectPartionStatsSQLTemplate = + " SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}', '-', ${partId}) AS id, " + + "${catalogId} AS catalog_id, " + + "${dbId} AS db_id, " + + "${tblId} AS tbl_id, " + + "${idxId} AS idx_id, " + + "'${colId}' AS col_id, " + + "${partId} AS part_id, " + + "COUNT(1) AS row_count, " + + "NDV(`${colName}`) AS ndv, " + + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " + + "MIN(`${colName}`) AS min, " + + "MAX(`${colName}`) AS max, " + + "${dataSizeFunction} AS data_size, " + + "NOW() "; + + + protected void beforeExecution() { + } + + public void execute() { + } + + protected void afterExecution() { + + } + +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 257708de54f78b..868bd1a888213b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -17,43 +17,187 @@ package org.apache.doris.statistics; +import org.apache.doris.catalog.MaterializedIndex; +import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; import org.apache.doris.common.FeConstants; +import org.apache.doris.common.Pair; +import org.apache.doris.datasource.InternalCatalog; import org.apache.doris.qe.AutoCloseConnectContext; import org.apache.doris.qe.QueryState; import org.apache.doris.qe.QueryState.MysqlStateType; import org.apache.doris.qe.StmtExecutor; +import org.apache.doris.statistics.AnalysisInfo.JobType; import org.apache.doris.statistics.util.StatisticsUtil; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; import org.apache.commons.text.StringSubstitutor; +import 
java.security.SecureRandom; import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.StringJoiner; +import java.util.stream.Collectors; /** * Each task analyze one column. */ public class OlapAnalysisTask extends BaseAnalysisTask { - private static final String ANALYZE_PARTITION_SQL_TEMPLATE = INSERT_PART_STATISTICS - + "FROM `${dbName}`.`${tblName}` " - + "PARTITION ${partName} ${sampleExpr}"; - // TODO Currently, NDV is computed for the full table; in fact, // NDV should only be computed for the relevant partition. private static final String ANALYZE_COLUMN_SQL_TEMPLATE = INSERT_COL_STATISTICS + " (SELECT NDV(`${colName}`) AS ndv " - + " FROM `${dbName}`.`${tblName}` ${sampleExpr}) t2\n"; + + " FROM `${dbName}`.`${tblName}`) t2\n"; + + private static final String COLLECT_PARTITION_STATS_SQL_TEMPLATE = + " SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}', '-', ${partId}) AS id, " + + "${catalogId} AS catalog_id, " + + "${dbId} AS db_id, " + + "${tblId} AS tbl_id, " + + "${idxId} AS idx_id, " + + "'${colId}' AS col_id, " + + "${partId} AS part_id, " + + "COUNT(1) AS row_count, " + + "NDV(`${colName}`) AS ndv, " + + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " + + "MIN(`${colName}`) AS min, " + + "MAX(`${colName}`) AS max, " + + "${dataSizeFunction} AS data_size, " + + "NOW() FROM `${dbName}`.`${tblName}` PARTITION ${partitionName}"; + + private static final String SAMPLE_COLUMN_SQL_TEMPLATE = "SELECT \n" + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, \n" + + "${catalogId} AS catalog_id, \n" + + "${dbId} AS db_id, \n" + + "${tblId} AS tbl_id, \n" + + "${idxId} AS idx_id, \n" + + "'${colId}' AS col_id, \n" + + "NULL AS part_id, \n" + + "COUNT(1) * ${scaleFactor} AS row_count, \n" + + "NDV(`${colName}`) * ${scaleFactor} AS ndv, \n" + + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 
ELSE 0 END) * ${scaleFactor} AS null_count, \n" + + "MIN(`${colName}`) AS min, \n" + + "MAX(`${colName}`) AS max, \n" + + "${dataSizeFunction} * ${scaleFactor} AS data_size, \n" + + "NOW()\n" + + "FROM `${dbName}`.`${tblName}`\n" + + "${tablets}"; + + // cache stats for each partition, it would be inserted into column_statistics in a batch. + private final List> buf = new ArrayList<>(); + + @VisibleForTesting + public OlapAnalysisTask() { + } public OlapAnalysisTask(AnalysisInfo info) { super(info); } public void doExecute() throws Exception { + + if (tableSample != null) { + doSample(); + } else { + doFull(); + } + } + + /** + * 1. Get col stats in sample ways + * 2. estimate partition stats + * 3. insert col stats and partition stats + */ + protected void doSample() throws Exception { + Pair, Long> pair = calcActualSampleTablets(); + List tabletIds = pair.first; + double scaleFactor = (double) tbl.getRowCount() / (double) pair.second; + // might happen if row count in fe metadata hasn't been updated yet + if (Double.isInfinite(scaleFactor)) { + scaleFactor = 1; + } + String tabletStr = tabletIds.stream() + .map(Object::toString) + .collect(Collectors.joining(", ")); + try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext(info.jobType.equals(JobType.SYSTEM))) { + Map params = new HashMap<>(); + params.put("internalDB", FeConstants.INTERNAL_DB_NAME); + params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); + params.put("catalogId", String.valueOf(catalog.getId())); + params.put("dbId", String.valueOf(db.getId())); + params.put("tblId", String.valueOf(tbl.getId())); + params.put("idxId", String.valueOf(info.indexId)); + params.put("colId", String.valueOf(info.colName)); + params.put("dataSizeFunction", getDataSizeFunction(col)); + params.put("dbName", info.dbName); + params.put("colName", String.valueOf(info.colName)); + params.put("tblName", String.valueOf(info.tblName)); + params.put("scaleFactor", String.valueOf(scaleFactor)); + 
params.put("tablets", tabletStr.isEmpty() ? "" : String.format("TABLET(%s)", tabletStr)); + StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + stmtExecutor = new StmtExecutor(r.connectContext, stringSubstitutor.replace(SAMPLE_COLUMN_SQL_TEMPLATE)); + // Scalar query only return one row + ColStatsData colStatsData = new ColStatsData(stmtExecutor.executeInternalQuery().get(0)); + OlapTable olapTable = (OlapTable) tbl; + Collection partitions = olapTable.getPartitions(); + int partitionCount = partitions.size(); + List values = partitions.stream().map(p -> String.format( + "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())", + StatisticsUtil.quote(StatisticsUtil.constructId(tbl.getId(), -1, col.getName(), p.getId())), + InternalCatalog.INTERNAL_CATALOG_ID, + db.getId(), + tbl.getId(), + -1, + StatisticsUtil.quote(col.getName()), + p.getId(), + colStatsData.count / partitionCount, + colStatsData.ndv / partitionCount, + colStatsData.nullCount / partitionCount, + StatisticsUtil.quote(colStatsData.minLit), + StatisticsUtil.quote(colStatsData.maxLit), + colStatsData.dataSizeInBytes / partitionCount)).collect(Collectors.toList()); + values.add(String.format( + "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())", + StatisticsUtil.quote(StatisticsUtil.constructId(tbl.getId(), -1, col.getName())), + InternalCatalog.INTERNAL_CATALOG_ID, + db.getId(), + tbl.getId(), + -1, + StatisticsUtil.quote(col.getName()), + "NULL", + colStatsData.count, + colStatsData.ndv, + colStatsData.nullCount, + StatisticsUtil.quote(colStatsData.minLit), + StatisticsUtil.quote(colStatsData.maxLit), + colStatsData.dataSizeInBytes)); + String insertSQL = "INSERT INTO " + + StatisticConstants.FULL_QUALIFIED_STATS_TBL_NAME + + " VALUES " + + String.join(",", values); + stmtExecutor = new StmtExecutor(r.connectContext, insertSQL); + executeWithExceptionOnFail(stmtExecutor); + } + } + + /** + * 1. Get stats of each partition + * 2. insert partition in batch + * 3. 
calculate column stats based on partition stats + */ + protected void doFull() throws Exception { + Set partitionNames = info.colToPartitions.get(info.colName); + if (partitionNames.isEmpty()) { + return; + } Map params = new HashMap<>(); params.put("internalDB", FeConstants.INTERNAL_DB_NAME); params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); @@ -66,60 +210,128 @@ public void doExecute() throws Exception { params.put("dbName", info.dbName); params.put("colName", String.valueOf(info.colName)); params.put("tblName", String.valueOf(info.tblName)); - params.put("sampleExpr", getSampleExpression()); List partitionAnalysisSQLs = new ArrayList<>(); try { tbl.readLock(); - Set partNames = info.colToPartitions.get(info.colName); - for (String partName : partNames) { - Partition part = tbl.getPartition(partName); + + for (String partitionName : partitionNames) { + Partition part = tbl.getPartition(partitionName); if (part == null) { continue; } - params.put("partId", String.valueOf(tbl.getPartition(partName).getId())); + params.put("partId", String.valueOf(tbl.getPartition(partitionName).getId())); // Avoid error when get the default partition - params.put("partName", "`" + partName + "`"); + params.put("partitionName", "`" + partitionName + "`"); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - partitionAnalysisSQLs.add(stringSubstitutor.replace(ANALYZE_PARTITION_SQL_TEMPLATE)); + partitionAnalysisSQLs.add(stringSubstitutor.replace(COLLECT_PARTITION_STATS_SQL_TEMPLATE)); } } finally { tbl.readUnlock(); } - execSQLs(partitionAnalysisSQLs); - params.remove("partId"); - params.put("type", col.getType().toString()); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - String sql = stringSubstitutor.replace(ANALYZE_COLUMN_SQL_TEMPLATE); - execSQL(sql); + execSQLs(partitionAnalysisSQLs, params); } @VisibleForTesting - public void execSQLs(List partitionAnalysisSQLs) throws Exception { - for (String sql : 
partitionAnalysisSQLs) { - execSQL(sql); + public void execSQLs(List partitionAnalysisSQLs, Map params) throws Exception { + long startTime = System.currentTimeMillis(); + LOG.debug("analyze task {} start at {}", info.toString(), new Date()); + try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext(info.jobType.equals(JobType.SYSTEM))) { + List> sqlGroups = Lists.partition(partitionAnalysisSQLs, StatisticConstants.UNION_ALL_LIMIT); + for (List group : sqlGroups) { + if (killed) { + return; + } + StringJoiner partitionCollectSQL = new StringJoiner("UNION ALL"); + group.forEach(partitionCollectSQL::add); + stmtExecutor = new StmtExecutor(r.connectContext, partitionCollectSQL.toString()); + buf.add(stmtExecutor.executeInternalQuery() + .stream().map(ColStatsData::new).collect(Collectors.toList())); + QueryState queryState = r.connectContext.getState(); + if (queryState.getStateType().equals(MysqlStateType.ERR)) { + throw new RuntimeException(String.format("Failed to analyze %s.%s.%s, error: %s sql: %s", + info.catalogName, info.dbName, info.colName, partitionCollectSQL, + queryState.getErrorMessage())); + } + } + for (List colStatsDataList : buf) { + StringBuilder batchInsertSQL = + new StringBuilder("INSERT INTO " + StatisticConstants.FULL_QUALIFIED_STATS_TBL_NAME + + " VALUES "); + StringJoiner sj = new StringJoiner(","); + colStatsDataList.forEach(c -> sj.add(c.toSQL(true))); + batchInsertSQL.append(sj.toString()); + stmtExecutor = new StmtExecutor(r.connectContext, batchInsertSQL.toString()); + executeWithExceptionOnFail(stmtExecutor); + } + params.put("type", col.getType().toString()); + StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + String sql = stringSubstitutor.replace(ANALYZE_COLUMN_SQL_TEMPLATE); + stmtExecutor = new StmtExecutor(r.connectContext, sql); + executeWithExceptionOnFail(stmtExecutor); + } finally { + LOG.debug("analyze task {} end. 
cost {}ms", info, + System.currentTimeMillis() - startTime); } } - @VisibleForTesting - public void execSQL(String sql) throws Exception { - if (killed) { - return; + // Get sample tablets id and scale up scaleFactor + protected Pair, Long> calcActualSampleTablets() { + // Below code copied from OlapScanNode.java + long sampleRows; // The total number of sample rows + long totalRows = 0; // The total number of partition rows hit + long totalTablet = 0; // The total number of tablets in the hit partition + OlapTable olapTable = (OlapTable) tbl; + if (tableSample.isPercent()) { + sampleRows = (long) Math.max(olapTable.getRowCount() * (tableSample.getSampleValue() / 100.0), 1); + } else { + sampleRows = Math.max(tableSample.getSampleValue(), 1); } - long startTime = System.currentTimeMillis(); - LOG.info("ANALYZE SQL : " + sql + " start at " + startTime); - try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) { - r.connectContext.getSessionVariable().disableNereidsPlannerOnce(); - stmtExecutor = new StmtExecutor(r.connectContext, sql); - r.connectContext.setExecutor(stmtExecutor); - stmtExecutor.execute(); - QueryState queryState = r.connectContext.getState(); - if (queryState.getStateType().equals(MysqlStateType.ERR)) { - throw new RuntimeException(String.format("Failed to analyze %s.%s.%s, error: %s sql: %s", - info.catalogName, info.dbName, info.colName, sql, queryState.getErrorMessage())); + + // calculate the number of tablets by each partition + long avgRowsPerPartition = sampleRows / Math.max(olapTable.getPartitions().size(), 1); + List sampleTabletIds = new ArrayList<>(); + long actualSampledRowCount = 0; + for (Partition p : olapTable.getPartitions()) { + List ids = p.getBaseIndex().getTabletIdsInOrder(); + + if (ids.isEmpty()) { + continue; } - } finally { - LOG.info("Analyze SQL: " + sql + " cost time: " + (System.currentTimeMillis() - startTime) + "ms"); + + // Skip partitions with row count < row count / 2 expected to be sampled per 
partition. + // It can be expected to sample a smaller number of partitions to avoid uneven distribution + // of sampling results. + if (p.getBaseIndex().getRowCount() < (avgRowsPerPartition / 2)) { + continue; + } + MaterializedIndex baseIndex = p.getBaseIndex(); + long avgRowsPerTablet = Math.max(baseIndex.getRowCount() / ids.size(), 1); + long tabletCounts = Math.max( + avgRowsPerPartition / avgRowsPerTablet + (avgRowsPerPartition % avgRowsPerTablet != 0 ? 1 : 0), 1); + tabletCounts = Math.min(tabletCounts, ids.size()); + long seek = tableSample.getSeek() != -1 + ? tableSample.getSeek() : (long) (new SecureRandom().nextDouble() * ids.size()); + for (int i = 0; i < tabletCounts; i++) { + int seekTid = (int) ((i + seek) % ids.size()); + long tabletId = ids.get(seekTid); + sampleTabletIds.add(tabletId); + actualSampledRowCount += baseIndex.getTablet(tabletId).getRowCount(true); + } + + totalRows += p.getBaseIndex().getRowCount(); + totalTablet += ids.size(); } - } + // all hit, direct full + if (totalRows < sampleRows) { + // can't fill full sample rows + sampleTabletIds.clear(); + } else if (sampleTabletIds.size() == totalTablet) { + // TODO add limit + sampleTabletIds.clear(); + } else if (!sampleTabletIds.isEmpty()) { + // TODO add limit + } + return Pair.of(sampleTabletIds, actualSampledRowCount); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ResultRow.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ResultRow.java new file mode 100644 index 00000000000000..9945175a228a93 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ResultRow.java @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics; + +import com.google.gson.annotations.SerializedName; + +import java.util.Collections; +import java.util.List; +import java.util.StringJoiner; + +public class ResultRow { + @SerializedName("values") + private final List values; + + public ResultRow(List values) { + this.values = values; + } + + public List getValues() { + return values != null ? values : Collections.emptyList(); + } + + @Override + public String toString() { + StringJoiner sj = new StringJoiner(",", "ResultRow:{", "}"); + for (String val : values) { + sj.add(val); + } + return sj.toString(); + } + + public String get(int idx) { + return values.get(idx); + } + + /** + * If analyze an empty table, some stats would be null, return a default value + * to avoid npe would deserialize it. + */ + public String getWithDefault(int idx, String defaultVal) { + String val = values.get(idx); + return val == null ? 
defaultVal : val; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java index a2194834030b5d..e6b8297d0c0b01 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java @@ -17,6 +17,9 @@ package org.apache.doris.statistics; +import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.TableIf; import org.apache.doris.cluster.ClusterNamespace; import org.apache.doris.common.FeConstants; import org.apache.doris.system.SystemInfoService; @@ -26,22 +29,16 @@ import java.util.concurrent.TimeUnit; public class StatisticConstants { - public static final String ANALYSIS_TBL_NAME = "table_statistics"; public static final String STATISTIC_TBL_NAME = "column_statistics"; - public static final String HISTOGRAM_TBL_NAME = "histogram_statistics"; public static final int MAX_NAME_LEN = 64; public static final int ID_LEN = 4096; - public static final int STATISTICS_CACHE_VALID_DURATION_IN_HOURS = 24 * 2; - public static final int STATISTICS_CACHE_REFRESH_INTERVAL = 24 * 2; - public static final int ROW_COUNT_CACHE_VALID_DURATION_IN_HOURS = 12; - /** * Bucket count fot column_statistics and analysis_job table. */ @@ -63,26 +60,51 @@ public class StatisticConstants { public static final int HISTOGRAM_MAX_BUCKET_NUM = 128; - /** - * The health of the table indicates the health of the table statistics, rang in [0, 100]. - * Below this threshold will automatically re-collect statistics. 
TODO make it in fe.conf - */ - public static final int TABLE_STATS_HEALTH_THRESHOLD = 80; - public static final int ANALYZE_MANAGER_INTERVAL_IN_SECS = 60; - public static List STATISTICS_DB_BLACK_LIST = new ArrayList<>(); + public static List SYSTEM_DBS = new ArrayList<>(); + + public static int ANALYZE_TASK_RETRY_TIMES = 5; public static final String DB_NAME = SystemInfoService.DEFAULT_CLUSTER + ":" + FeConstants.INTERNAL_DB_NAME; + public static final String FULL_QUALIFIED_STATS_TBL_NAME = FeConstants.INTERNAL_DB_NAME + "." + STATISTIC_TBL_NAME; + public static final int STATISTIC_INTERNAL_TABLE_REPLICA_NUM = 3; - public static int ANALYZE_TASK_RETRY_TIMES = 3; + public static final int RETRY_LOAD_QUEUE_SIZE = 1000; + + public static final int RETRY_LOAD_THREAD_POOL_SIZE = 1; + + public static final int LOAD_RETRY_TIMES = 3; + + // union more relation than 512 may cause StackOverFlowException in the future. + public static final int UNION_ALL_LIMIT = 512; + + public static final String FULL_AUTO_ANALYZE_START_TIME = "00:00:00"; + public static final String FULL_AUTO_ANALYZE_END_TIME = "23:59:59"; static { - STATISTICS_DB_BLACK_LIST.add(SystemInfoService.DEFAULT_CLUSTER + SYSTEM_DBS.add(SystemInfoService.DEFAULT_CLUSTER + ClusterNamespace.CLUSTER_DELIMITER + FeConstants.INTERNAL_DB_NAME); - STATISTICS_DB_BLACK_LIST.add(SystemInfoService.DEFAULT_CLUSTER + SYSTEM_DBS.add(SystemInfoService.DEFAULT_CLUSTER + ClusterNamespace.CLUSTER_DELIMITER + "information_schema"); } + + public static boolean isSystemTable(TableIf tableIf) { + if (tableIf instanceof OlapTable) { + OlapTable olapTable = (OlapTable) tableIf; + if (StatisticConstants.SYSTEM_DBS.contains(olapTable.getQualifiedDbName())) { + return true; + } + } + return false; + } + + public static boolean shouldIgnoreCol(TableIf tableIf, Column c) { + if (isSystemTable(tableIf)) { + return true; + } + return !c.isVisible(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java index d3bdbdabd0b0e5..74b77c2ee7c91f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java @@ -17,6 +17,8 @@ package org.apache.doris.statistics; +import org.apache.doris.analysis.LiteralExpr; +import org.apache.doris.common.Pair; import org.apache.doris.nereids.types.DataType; import java.util.Objects; @@ -29,20 +31,47 @@ public class StatisticRange { * {@code NaN} represents empty range ({@code high} must be {@code NaN} too) */ private final double low; + + private final LiteralExpr lowExpr; /** * {@code NaN} represents empty range ({@code low} must be {@code NaN} too) */ private final double high; + private final LiteralExpr highExpr; + private final double distinctValues; private final DataType dataType; - public StatisticRange(double low, double high, double distinctValues, DataType dataType) { + private final boolean isEmpty; + + public StatisticRange(double low, LiteralExpr lowExpr, double high, LiteralExpr highExpr, + double distinctValues, DataType dataType) { + this(low, lowExpr, high, highExpr, distinctValues, dataType, false); + } + + private StatisticRange(double low, LiteralExpr lowExpr, double high, LiteralExpr highExpr, + double distinctValues, DataType dataType, boolean isEmpty) { this.low = low; + this.lowExpr = lowExpr; this.high = high; + this.highExpr = highExpr; this.distinctValues = distinctValues; this.dataType = dataType; + this.isEmpty = isEmpty; + } + + public LiteralExpr getLowExpr() { + return lowExpr; + } + + public LiteralExpr getHighExpr() { + return highExpr; + } + + public DataType getDataType() { + return dataType; } public double overlapPercentWith(StatisticRange other) { @@ -79,19 +108,29 @@ public double overlapPercentWith(StatisticRange other) { } public static StatisticRange empty(DataType dataType) { - return new StatisticRange(Double.NaN, 
Double.NaN, 0, dataType); + return new StatisticRange(Double.NEGATIVE_INFINITY, null, Double.POSITIVE_INFINITY, + null, 0, dataType, true); } public boolean isEmpty() { - return Double.isNaN(low) && Double.isNaN(high); + return isEmpty; } public boolean isBothInfinite() { return Double.isInfinite(low) && Double.isInfinite(high); } - public static StatisticRange from(ColumnStatistic column, DataType dataType) { - return new StatisticRange(column.minValue, column.maxValue, column.ndv, dataType); + public boolean isInfinite() { + return Double.isInfinite(low) || Double.isInfinite(high); + } + + public boolean isFinite() { + return Double.isFinite(low) && Double.isFinite(high); + } + + public static StatisticRange from(ColumnStatistic colStats, DataType dataType) { + return new StatisticRange(colStats.minValue, colStats.minExpr, colStats.maxValue, colStats.maxExpr, + colStats.ndv, dataType); } public double getLow() { @@ -107,22 +146,49 @@ public double length() { } public StatisticRange intersect(StatisticRange other) { - double newLow = Math.max(low, other.low); - double newHigh = Math.min(high, other.high); + Pair biggerLow = maxPair(low, lowExpr, other.low, other.lowExpr); + double newLow = biggerLow.first; + LiteralExpr newLowExpr = biggerLow.second; + + Pair smallerHigh = minPair(high, highExpr, other.high, other.highExpr); + double newHigh = smallerHigh.first; + LiteralExpr newHighExpr = smallerHigh.second; if (newLow <= newHigh) { - return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other), dataType); + return new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, + overlappingDistinctValues(other), dataType); } return empty(dataType); } + public Pair minPair(double r1, LiteralExpr e1, double r2, LiteralExpr e2) { + if (r1 < r2) { + return Pair.of(r1, e1); + } + return Pair.of(r2, e2); + } + + public Pair maxPair(double r1, LiteralExpr e1, double r2, LiteralExpr e2) { + if (r1 > r2) { + return Pair.of(r1, e1); + } + return Pair.of(r2, 
e2); + } + public StatisticRange cover(StatisticRange other) { - double newLow = Math.max(low, other.low); - double newHigh = Math.min(high, other.high); + // double newLow = Math.max(low, other.low); + // double newHigh = Math.min(high, other.high); + Pair biggerLow = maxPair(low, lowExpr, other.low, other.lowExpr); + double newLow = biggerLow.first; + LiteralExpr newLowExpr = biggerLow.second; + Pair smallerHigh = minPair(high, highExpr, other.high, other.highExpr); + double newHigh = smallerHigh.first; + LiteralExpr newHighExpr = smallerHigh.second; + if (newLow <= newHigh) { double overlapPercentOfLeft = overlapPercentWith(other); double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues; double coveredDistinctValues = minExcludeNaN(distinctValues, overlapDistinctValuesLeft); - return new StatisticRange(newLow, newHigh, coveredDistinctValues, dataType); + return new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, coveredDistinctValues, dataType); } return empty(dataType); } @@ -135,7 +201,10 @@ public StatisticRange union(StatisticRange other) { double maxOverlapNDV = Math.max(overlapNDVThis, overlapNDVOther); double newNDV = maxOverlapNDV + ((1 - overlapPercentThis) * distinctValues) + ((1 - overlapPercentOther) * other.distinctValues); - return new StatisticRange(Math.min(low, other.low), Math.max(high, other.high), newNDV, dataType); + Pair smallerMin = minPair(low, lowExpr, other.low, other.lowExpr); + Pair biggerHigh = maxPair(high, highExpr, other.high, other.highExpr); + return new StatisticRange(smallerMin.first, smallerMin.second, + biggerHigh.first, biggerHigh.second, newNDV, dataType); } private double overlappingDistinctValues(StatisticRange other) { @@ -170,7 +239,4 @@ public double getDistinctValues() { return distinctValues; } - public static StatisticRange fromColumnStatistics(ColumnStatistic columnStatistic, DataType dataType) { - return new StatisticRange(columnStatistic.minValue, columnStatistic.maxValue, 
columnStatistic.ndv, dataType); - } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticalType.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticalType.java index 67dd9bb05432bb..7fe9b03cbcfa70 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticalType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticalType.java @@ -22,6 +22,7 @@ public enum StatisticalType { AGG_NODE, ANALYTIC_EVAL_NODE, ASSERT_NUM_ROWS_NODE, + CTE_SCAN_NODE, BROKER_SCAN_NODE, NESTED_LOOP_JOIN_NODE, EMPTY_SET_NODE, @@ -54,4 +55,5 @@ public enum StatisticalType { METADATA_SCAN_NODE, JDBC_SCAN_NODE, TEST_EXTERNAL_TABLE, + GROUP_COMMIT_SCAN_NODE } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java index 5c628aaba302ca..77c221f5931b38 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java @@ -17,17 +17,18 @@ package org.apache.doris.statistics; -import org.apache.doris.nereids.stats.ExpressionEstimation; import org.apache.doris.nereids.stats.StatsMathUtil; import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.Slot; import java.text.DecimalFormat; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; public class Statistics { - private static int K_BYTES = 1024; + private static final int K_BYTES = 1024; private final double rowCount; @@ -36,33 +37,10 @@ public class Statistics { // the byte size of one tuple private double tupleSize; - @Deprecated - private double width; - - @Deprecated - private double penalty; - - /** - * after filter, compute the new ndv of a column - * @param ndv original ndv of column - * @param newRowCount the row count of table after filter - * @param oldRowCount the row count of table 
before filter - * @return the new ndv after filter - */ - public static double computeNdv(double ndv, double newRowCount, double oldRowCount) { - if (newRowCount > oldRowCount) { - return ndv; - } - double selectOneTuple = newRowCount / StatsMathUtil.nonZeroDivisor(oldRowCount); - double allTuplesOfSameDistinctValueNotSelected = Math.pow((1 - selectOneTuple), oldRowCount / ndv); - return Math.min(ndv * (1 - allTuplesOfSameDistinctValueNotSelected), newRowCount); - } - public Statistics(Statistics another) { this.rowCount = another.rowCount; this.expressionToColumnStats = new HashMap<>(another.expressionToColumnStats); - this.width = another.width; - this.penalty = another.penalty; + this.tupleSize = another.tupleSize; } public Statistics(double rowCount, Map expressionToColumnStats) { @@ -70,14 +48,6 @@ public Statistics(double rowCount, Map expressionTo this.expressionToColumnStats = expressionToColumnStats; } - public Statistics(double rowCount, Map expressionToColumnStats, double width, - double penalty) { - this.rowCount = rowCount; - this.expressionToColumnStats = expressionToColumnStats; - this.width = width; - this.penalty = penalty; - } - public ColumnStatistic findColumnStatistics(Expression expression) { return expressionToColumnStats.get(expression); } @@ -90,53 +60,46 @@ public double getRowCount() { return rowCount; } - /* - * Return a stats with new rowCount and fix each column stats. - */ public Statistics withRowCount(double rowCount) { - if (Double.isNaN(rowCount)) { - return this; - } - Statistics statistics = new Statistics(rowCount, new HashMap<>(expressionToColumnStats), width, penalty); - statistics.fix(rowCount, StatsMathUtil.nonZeroDivisor(this.rowCount)); - return statistics; + return new Statistics(rowCount, new HashMap<>(expressionToColumnStats)); } /** * Update by count. 
*/ - public Statistics updateRowCountOnly(double rowCount) { + public Statistics withRowCountAndEnforceValid(double rowCount) { Statistics statistics = new Statistics(rowCount, expressionToColumnStats); - for (Entry entry : expressionToColumnStats.entrySet()) { - ColumnStatistic columnStatistic = entry.getValue(); - ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(columnStatistic); - columnStatisticBuilder.setNdv(Math.min(columnStatistic.ndv, rowCount)); - double nullFactor = (rowCount - columnStatistic.numNulls) / rowCount; - columnStatisticBuilder.setNumNulls(nullFactor * rowCount); - columnStatisticBuilder.setCount(rowCount); - statistics.addColumnStats(entry.getKey(), columnStatisticBuilder.build()); - } + statistics.enforceValid(); return statistics; } - /** - * Fix by sel. - */ - public void fix(double newRowCount, double originRowCount) { - double sel = newRowCount / originRowCount; + public void enforceValid() { for (Entry entry : expressionToColumnStats.entrySet()) { ColumnStatistic columnStatistic = entry.getValue(); - ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(columnStatistic); - columnStatisticBuilder.setNdv(computeNdv(columnStatistic.ndv, newRowCount, originRowCount)); - columnStatisticBuilder.setNumNulls(Math.min(columnStatistic.numNulls * sel, newRowCount)); - columnStatisticBuilder.setCount(newRowCount); - expressionToColumnStats.put(entry.getKey(), columnStatisticBuilder.build()); + if (!checkColumnStatsValid(columnStatistic)) { + double ndv = Math.min(columnStatistic.ndv, rowCount); + ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(columnStatistic); + columnStatisticBuilder.setNdv(ndv); + columnStatisticBuilder.setNumNulls(Math.min(columnStatistic.numNulls, rowCount - ndv)); + columnStatisticBuilder.setCount(rowCount); + columnStatistic = columnStatisticBuilder.build(); + expressionToColumnStats.put(entry.getKey(), columnStatistic); + } } } + public boolean 
checkColumnStatsValid(ColumnStatistic columnStatistic) { + return columnStatistic.ndv <= rowCount + && columnStatistic.numNulls <= rowCount - columnStatistic.ndv; + } + public Statistics withSel(double sel) { sel = StatsMathUtil.minNonNaN(sel, 1); - return withRowCount(rowCount * sel); + if (Double.isNaN(rowCount)) { + return this; + } + double newCount = rowCount * sel; + return new Statistics(newCount, new HashMap<>(expressionToColumnStats)); } public Statistics addColumnStats(Expression expression, ColumnStatistic columnStatistic) { @@ -144,9 +107,10 @@ public Statistics addColumnStats(Expression expression, ColumnStatistic columnSt return this; } - public Statistics merge(Statistics statistics) { - expressionToColumnStats.putAll(statistics.expressionToColumnStats); - return this; + public boolean isInputSlotsUnknown(Set inputs) { + return inputs.stream() + .allMatch(s -> expressionToColumnStats.containsKey(s) + && expressionToColumnStats.get(s).isUnKnown); } private double computeTupleSize() { @@ -183,53 +147,20 @@ public String toString() { return format.format(rowCount); } - public void setWidth(double width) { - this.width = width; - } - - public void setPenalty(double penalty) { - this.penalty = penalty; - } - - public double getWidth() { - return width; - } - - public double getPenalty() { - return penalty; - } - public int getBENumber() { return 1; } public static Statistics zero(Statistics statistics) { Statistics zero = new Statistics(0, new HashMap<>()); - for (Map.Entry entry : statistics.expressionToColumnStats.entrySet()) { + for (Entry entry : statistics.expressionToColumnStats.entrySet()) { zero.addColumnStats(entry.getKey(), ColumnStatistic.ZERO); } return zero; } - public boolean almostUniqueExpression(Expression expr) { - ExpressionEstimation estimator = new ExpressionEstimation(); - double ndvErrorThreshold = 0.9; - ColumnStatistic colStats = expr.accept(estimator, this); - if (colStats.ndv > colStats.count * ndvErrorThreshold) { - return 
true; - } - return false; - } - - public boolean isStatsUnknown(Expression expr) { - ExpressionEstimation estimator = new ExpressionEstimation(); - ColumnStatistic colStats = expr.accept(estimator, this); - return colStats.isUnKnown; - } - /** * merge this and other colStats.ndv, choose min - * @param other */ public void updateNdv(Statistics other) { for (Expression expr : expressionToColumnStats.keySet()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java deleted file mode 100644 index aae783ca8b278b..00000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java +++ /dev/null @@ -1,215 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -package org.apache.doris.statistics; - -import org.apache.doris.analysis.DdlStmt; -import org.apache.doris.catalog.Column; -import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.Partition; -import org.apache.doris.catalog.TableIf; -import org.apache.doris.common.Config; -import org.apache.doris.common.DdlException; -import org.apache.doris.common.util.MasterDaemon; -import org.apache.doris.statistics.AnalysisInfo.JobType; -import org.apache.doris.statistics.util.StatisticsUtil; - -import com.google.common.collect.Maps; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; - -public class StatisticsAutoAnalyzer extends MasterDaemon { - - private static final Logger LOG = LogManager.getLogger(StatisticsAutoAnalyzer.class); - - public StatisticsAutoAnalyzer() { - super("Automatic Analyzer", TimeUnit.MINUTES.toMillis(Config.auto_check_statistics_in_minutes)); - } - - @Override - protected void runAfterCatalogReady() { - if (!Env.getCurrentEnv().isMaster()) { - return; - } - if (!StatisticsUtil.statsTblAvailable()) { - return; - } - if (Config.enable_auto_collect_statistics) { - analyzePeriodically(); - analyzeAutomatically(); - } - } - - public void autoAnalyzeStats(DdlStmt ddlStmt) { - // TODO Monitor some DDL statements, and then trigger automatic analysis tasks - } - - private void analyzePeriodically() { - try { - AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); - List jobInfos = analysisManager.findPeriodicJobs(); - for (AnalysisInfo jobInfo : jobInfos) { - jobInfo = new AnalysisInfoBuilder(jobInfo).setJobType(JobType.SYSTEM).build(); - analysisManager.createAnalysisJob(jobInfo); - } - } catch (DdlException e) { - LOG.warn("Failed to periodically analyze the statistics." 
+ e); - } - } - - private void analyzeAutomatically() { - AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); - List jobInfos = analysisManager.findAutomaticAnalysisJobs(); - for (AnalysisInfo jobInfo : jobInfos) { - AnalysisInfo checkedJobInfo = null; - try { - checkedJobInfo = checkAutomaticJobInfo(jobInfo); - if (checkedJobInfo != null) { - analysisManager.createAnalysisJob(checkedJobInfo); - } - } catch (Throwable t) { - LOG.warn("Failed to create analyze job: {}", checkedJobInfo); - } - - } - } - - /** - * Check if automatic analysis of statistics is required. - *

- * Step1: check the health of the table, if the health is good, - * there is no need to re-analyze, or check partition - *

- * Step2: check the partition update time, if the partition is not updated - * after the statistics is analyzed, there is no need to re-analyze - *

- * Step3: if the partition is updated after the statistics is analyzed, - * check the health of the partition, if the health is good, there is no need to re-analyze - * - Step3.1: check the analyzed partition statistics - * - Step3.2: Check for new partitions for which statistics were not analyzed - *

- * TODO new columns is not currently supported to analyze automatically - * - * @param jobInfo analysis job info - * @return new job info after check - * @throws Throwable failed to check - */ - private AnalysisInfo checkAutomaticJobInfo(AnalysisInfo jobInfo) throws Throwable { - long lastExecTimeInMs = jobInfo.lastExecTimeInMs; - TableIf table = StatisticsUtil - .findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); - TableStatistic tblStats = StatisticsRepository.fetchTableLevelStats(table.getId()); - - if (tblStats == TableStatistic.UNKNOWN) { - LOG.warn("Failed to automatically analyze statistics, " - + "no corresponding table statistics for job: {}", jobInfo.toString()); - throw new DdlException("No corresponding table statistics for automatic job."); - } - - if (!needReanalyzeTable(table, tblStats)) { - return null; - } - - Set needRunPartitions = new HashSet<>(); - Set statsPartitions = jobInfo.colToPartitions.values() - .stream() - .flatMap(Collection::stream) - .collect(Collectors.toSet()); - - checkAnalyzedPartitions(table, statsPartitions, needRunPartitions, lastExecTimeInMs); - checkNewPartitions(table, needRunPartitions, lastExecTimeInMs); - - if (needRunPartitions.isEmpty()) { - return null; - } - - return getAnalysisJobInfo(jobInfo, table, needRunPartitions); - } - - private boolean needReanalyzeTable(TableIf table, TableStatistic tblStats) { - long rowCount = table.getRowCount(); - long updateRows = Math.abs(rowCount - tblStats.rowCount); - int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows); - return tblHealth < StatisticConstants.TABLE_STATS_HEALTH_THRESHOLD; - } - - private void checkAnalyzedPartitions(TableIf table, Set statsPartitions, - Set needRunPartitions, long lastExecTimeInMs) throws DdlException { - for (String statsPartition : statsPartitions) { - Partition partition = table.getPartition(statsPartition); - if (partition == null) { - // Partition that has been deleted also need to - // be reanalyzed (delete 
partition statistics later) - needRunPartitions.add(statsPartition); - continue; - } - TableStatistic partitionStats = StatisticsRepository - .fetchTableLevelOfPartStats(partition.getId()); - if (partitionStats == TableStatistic.UNKNOWN) { - continue; - } - if (needReanalyzePartition(lastExecTimeInMs, partition, partitionStats)) { - needRunPartitions.add(partition.getName()); - } - } - } - - private boolean needReanalyzePartition(long lastExecTimeInMs, Partition partition, TableStatistic partStats) { - long partUpdateTime = partition.getVisibleVersionTime(); - if (partUpdateTime < lastExecTimeInMs) { - return false; - } - long pRowCount = partition.getBaseIndex().getRowCount(); - long pUpdateRows = Math.abs(pRowCount - partStats.rowCount); - int partHealth = StatisticsUtil.getTableHealth(pRowCount, pUpdateRows); - return partHealth < StatisticConstants.TABLE_STATS_HEALTH_THRESHOLD; - } - - private void checkNewPartitions(TableIf table, Set needRunPartitions, long lastExecTimeInMs) { - Set partitionNames = table.getPartitionNames(); - partitionNames.removeAll(needRunPartitions); - needRunPartitions.addAll( - partitionNames.stream() - .map(table::getPartition) - .filter(partition -> partition.getVisibleVersionTime() >= lastExecTimeInMs) - .map(Partition::getName) - .collect(Collectors.toSet()) - ); - } - - private AnalysisInfo getAnalysisJobInfo(AnalysisInfo jobInfo, TableIf table, - Set needRunPartitions) { - Map> newColToPartitions = Maps.newHashMap(); - Map> colToPartitions = jobInfo.colToPartitions; - colToPartitions.keySet().forEach(colName -> { - Column column = table.getColumn(colName); - if (column != null) { - newColToPartitions.put(colName, needRunPartitions); - } - }); - return new AnalysisInfoBuilder(jobInfo) - .setColToPartitions(newColToPartitions).build(); - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java new file mode 
100644 index 00000000000000..fe535b0fb4ab0e --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics; + +import org.apache.doris.analysis.TableName; +import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.DatabaseIf; +import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.TableIf; +import org.apache.doris.catalog.external.ExternalTable; +import org.apache.doris.common.Config; +import org.apache.doris.common.util.TimeUtils; +import org.apache.doris.datasource.CatalogIf; +import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod; +import org.apache.doris.statistics.AnalysisInfo.JobType; +import org.apache.doris.statistics.AnalysisInfo.ScheduleType; +import org.apache.doris.statistics.util.StatisticsUtil; + +import com.google.common.collect.Maps; +import org.apache.hudi.common.util.VisibleForTesting; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.time.LocalTime; +import java.util.ArrayList; +import java.util.Collection; +import 
java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +public class StatisticsAutoCollector extends StatisticsCollector { + + private static final Logger LOG = LogManager.getLogger(StatisticsAutoCollector.class); + + public StatisticsAutoCollector() { + super("Automatic Analyzer", + TimeUnit.MINUTES.toMillis(Config.auto_check_statistics_in_minutes), + new AnalysisTaskExecutor(Config.full_auto_analyze_simultaneously_running_task_num)); + } + + @Override + protected void collect() { + if (!StatisticsUtil.inAnalyzeTime(LocalTime.now(TimeUtils.getTimeZone().toZoneId()))) { + analysisTaskExecutor.clear(); + return; + } + if (StatisticsUtil.enableAutoAnalyze()) { + analyzeAll(); + } + } + + @SuppressWarnings({"rawtypes", "unchecked"}) + private void analyzeAll() { + Set catalogs = Env.getCurrentEnv().getCatalogMgr().getCopyOfCatalog(); + for (CatalogIf ctl : catalogs) { + if (!ctl.enableAutoAnalyze()) { + continue; + } + Collection dbs = ctl.getAllDbs(); + for (DatabaseIf databaseIf : dbs) { + if (StatisticConstants.SYSTEM_DBS.contains(databaseIf.getFullName())) { + continue; + } + analyzeDb(databaseIf); + } + } + } + + public void analyzeDb(DatabaseIf databaseIf) { + List analysisInfos = constructAnalysisInfo(databaseIf); + for (AnalysisInfo analysisInfo : analysisInfos) { + analysisInfo = getReAnalyzeRequiredPart(analysisInfo); + if (analysisInfo == null) { + continue; + } + try { + createSystemAnalysisJob(analysisInfo); + } catch (Exception e) { + LOG.warn("Failed to create analysis job", e); + } + } + } + + protected List constructAnalysisInfo(DatabaseIf db) { + List analysisInfos = new ArrayList<>(); + for (TableIf table : db.getTables()) { + if (skip(table)) { + continue; + } + createAnalyzeJobForTbl(db, analysisInfos, table); + } + return analysisInfos; + } + + // return true if skip auto analyze this time. 
+ protected boolean skip(TableIf table) { + if (!(table instanceof OlapTable || table instanceof ExternalTable)) { + return true; + } + if (table.getDataSize(true) < Config.huge_table_lower_bound_size_in_bytes) { + return false; + } + TableStatsMeta tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId()); + return System.currentTimeMillis() - tableStats.updatedTime < Config.huge_table_auto_analyze_interval_in_millis; + } + + protected void createAnalyzeJobForTbl(DatabaseIf db, + List analysisInfos, TableIf table) { + AnalysisMethod analysisMethod = table.getDataSize(true) > Config.huge_table_lower_bound_size_in_bytes + ? AnalysisMethod.SAMPLE : AnalysisMethod.FULL; + TableName tableName = new TableName(db.getCatalog().getName(), db.getFullName(), + table.getName()); + AnalysisInfo jobInfo = new AnalysisInfoBuilder() + .setJobId(Env.getCurrentEnv().getNextId()) + .setCatalogName(db.getCatalog().getName()) + .setDbName(db.getFullName()) + .setTblName(tableName.getTbl()) + .setColName( + table.getBaseSchema().stream().filter(c -> !StatisticsUtil.isUnsupportedType(c.getType())) + .map( + Column::getName).collect(Collectors.joining(",")) + ) + .setAnalysisType(AnalysisInfo.AnalysisType.FUNDAMENTALS) + .setAnalysisMode(AnalysisInfo.AnalysisMode.INCREMENTAL) + .setAnalysisMethod(analysisMethod) + .setSampleRows(Config.huge_table_default_sample_rows) + .setScheduleType(ScheduleType.AUTOMATIC) + .setState(AnalysisState.PENDING) + .setTaskIds(new ArrayList<>()) + .setLastExecTimeInMs(System.currentTimeMillis()) + .setJobType(JobType.SYSTEM).build(); + analysisInfos.add(jobInfo); + } + + @VisibleForTesting + protected AnalysisInfo getReAnalyzeRequiredPart(AnalysisInfo jobInfo) { + TableIf table = StatisticsUtil + .findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); + AnalysisManager analysisManager = Env.getServingEnv().getAnalysisManager(); + TableStatsMeta tblStats = analysisManager.findTableStatsStatus(table.getId()); + + if 
(!table.needReAnalyzeTable(tblStats)) { + return null; + } + + Map> needRunPartitions = table.findReAnalyzeNeededPartitions(); + + if (needRunPartitions.isEmpty()) { + return null; + } + + return new AnalysisInfoBuilder(jobInfo).setColToPartitions(needRunPartitions).build(); + } + + @VisibleForTesting + protected AnalysisInfo getAnalysisJobInfo(AnalysisInfo jobInfo, TableIf table, + Set needRunPartitions) { + Map> newColToPartitions = Maps.newHashMap(); + Map> colToPartitions = jobInfo.colToPartitions; + if (colToPartitions == null) { + for (Column c : table.getColumns()) { + if (StatisticsUtil.isUnsupportedType(c.getType())) { + continue; + } + newColToPartitions.put(c.getName(), needRunPartitions); + } + } else { + colToPartitions.keySet().forEach(colName -> { + Column column = table.getColumn(colName); + if (column != null) { + newColToPartitions.put(colName, needRunPartitions); + } + }); + } + return new AnalysisInfoBuilder(jobInfo) + .setColToPartitions(newColToPartitions).build(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java index 3622006542d93d..a0e75f7df38090 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java @@ -26,7 +26,7 @@ public class StatisticsBuilder { private double rowCount; - private Map expressionToColumnStats; + private final Map expressionToColumnStats; public StatisticsBuilder() { expressionToColumnStats = new HashMap<>(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java index cb9e4ca322882f..c9b049a8cfc083 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java @@ -24,7 +24,6 
@@ import org.apache.doris.ha.FrontendNodeType; import org.apache.doris.persist.gson.GsonUtils; import org.apache.doris.qe.ConnectContext; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.system.Frontend; import org.apache.doris.thrift.FrontendService; @@ -34,16 +33,21 @@ import com.github.benmanes.caffeine.cache.AsyncLoadingCache; import com.github.benmanes.caffeine.cache.Caffeine; import org.apache.commons.collections.CollectionUtils; +import org.apache.hudi.common.util.VisibleForTesting; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.time.Duration; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutionException; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; public class StatisticsCache { @@ -59,7 +63,6 @@ public class StatisticsCache { private final ColumnStatisticsCacheLoader columnStatisticsCacheLoader = new ColumnStatisticsCacheLoader(); private final HistogramCacheLoader histogramCacheLoader = new HistogramCacheLoader(); - private final TableStatisticsCacheLoader tableStatisticsCacheLoader = new TableStatisticsCacheLoader(); private final AsyncLoadingCache> columnStatisticsCache = Caffeine.newBuilder() @@ -75,20 +78,12 @@ public class StatisticsCache { .executor(threadPool) .buildAsync(histogramCacheLoader); - private final AsyncLoadingCache> tableStatisticsCache = - Caffeine.newBuilder() - .maximumSize(Config.stats_cache_size) - .refreshAfterWrite(Duration.ofHours(StatisticConstants.STATISTICS_CACHE_REFRESH_INTERVAL)) - .executor(threadPool) - .buildAsync(tableStatisticsCacheLoader); - { threadPool.submit(() -> { while (true) { try { 
columnStatisticsCacheLoader.removeExpiredInProgressing(); histogramCacheLoader.removeExpiredInProgressing(); - tableStatisticsCacheLoader.removeExpiredInProgressing(); } catch (Throwable t) { // IGNORE } @@ -141,23 +136,6 @@ public Optional getHistogram(long tblId, long idxId, String colName) return Optional.empty(); } - public Optional getTableStatistics(long catalogId, long dbId, long tableId) { - ConnectContext ctx = ConnectContext.get(); - if (ctx != null && ctx.getSessionVariable().internalSession) { - return Optional.empty(); - } - StatisticsCacheKey k = new StatisticsCacheKey(catalogId, dbId, tableId); - try { - CompletableFuture> f = tableStatisticsCache.get(k); - if (f.isDone()) { - return f.get(); - } - } catch (Exception e) { - LOG.warn("Unexpected exception while returning Histogram", e); - } - return Optional.empty(); - } - public void invalidate(long tblId, long idxId, String colName) { columnStatisticsCache.synchronous().invalidate(new StatisticsCacheKey(tblId, idxId, colName)); } @@ -174,14 +152,6 @@ public void refreshColStatsSync(long catalogId, long dbId, long tblId, long idxI columnStatisticsCache.synchronous().refresh(new StatisticsCacheKey(catalogId, dbId, tblId, idxId, colName)); } - public void invalidateTableStats(long catalogId, long dbId, long tblId) { - tableStatisticsCache.synchronous().invalidate(new StatisticsCacheKey(catalogId, dbId, tblId)); - } - - public void refreshTableStatsSync(long catalogId, long dbId, long tblId) { - tableStatisticsCache.synchronous().refresh(new StatisticsCacheKey(catalogId, dbId, tblId)); - } - public void refreshHistogramSync(long tblId, long idxId, String colName) { histogramCache.synchronous().refresh(new StatisticsCacheKey(tblId, idxId, colName)); } @@ -219,83 +189,113 @@ private void doPreHeat() { if (CollectionUtils.isEmpty(recentStatsUpdatedCols)) { return; } + Map keyToColStats = new HashMap<>(); for (ResultRow r : recentStatsUpdatedCols) { try { - long tblId = 
Long.parseLong(r.getColumnValue("tbl_id")); - long idxId = Long.parseLong(r.getColumnValue("idx_id")); - String colId = r.getColumnValue("col_id"); + StatsId statsId = new StatsId(r); + long tblId = statsId.tblId; + long idxId = statsId.idxId; + String colId = statsId.colId; final StatisticsCacheKey k = new StatisticsCacheKey(tblId, idxId, colId); final ColumnStatistic c = ColumnStatistic.fromResultRow(r); - c.loadPartitionStats(tblId, idxId, colId); + keyToColStats.put(k, c); putCache(k, c); } catch (Throwable t) { LOG.warn("Error when preheating stats cache", t); } } + try { + loadPartStats(keyToColStats); + } catch (Exception e) { + LOG.warn("Failed to load partition statistics", e); + } } - public void syncLoadColStats(long tableId, long idxId, String colName) { + /** + * Return false if loading the corresponding column stats failed. + */ + public boolean syncLoadColStats(long tableId, long idxId, String colName) { List columnResults = StatisticsRepository.loadColStats(tableId, idxId, colName); final StatisticsCacheKey k = new StatisticsCacheKey(tableId, idxId, colName); final ColumnStatistic c = ColumnStatistic.fromResultRow(columnResults); if (c == ColumnStatistic.UNKNOWN) { - return; + return false; } putCache(k, c); + if (ColumnStatistic.UNKNOWN == c) { + return false; + } TUpdateFollowerStatsCacheRequest updateFollowerStatsCacheRequest = new TUpdateFollowerStatsCacheRequest(); updateFollowerStatsCacheRequest.key = GsonUtils.GSON.toJson(k); - updateFollowerStatsCacheRequest.colStats = GsonUtils.GSON.toJson(c); + updateFollowerStatsCacheRequest.statsRows = columnResults.stream().map(GsonUtils.GSON::toJson).collect( + Collectors.toList()); for (Frontend frontend : Env.getCurrentEnv().getFrontends(FrontendNodeType.FOLLOWER)) { - if (frontend.getHost().equals(Env.getCurrentEnv().getSelfNode().getHost())) { - // Doesn't need to send request to current node.
+ if (StatisticsUtil.isMaster(frontend)) { continue; } - TNetworkAddress address = new TNetworkAddress(frontend.getHost(), - frontend.getRpcPort()); - FrontendService.Client client = null; - try { - client = ClientPool.frontendPool.borrowObject(address); - client.updateStatsCache(updateFollowerStatsCacheRequest); - } catch (Throwable t) { - LOG.warn("Failed to sync stats to follower: {}", address, t); - } finally { - if (client != null) { - ClientPool.frontendPool.returnObject(address, client); - } - } + sendStats(frontend, updateFollowerStatsCacheRequest); } - + return true; } - public void putCache(StatisticsCacheKey k, ColumnStatistic c) { - CompletableFuture> f = new CompletableFuture>() { - - @Override - public Optional get() throws InterruptedException, ExecutionException { - return Optional.of(c); + @VisibleForTesting + public void sendStats(Frontend frontend, TUpdateFollowerStatsCacheRequest updateFollowerStatsCacheRequest) { + TNetworkAddress address = new TNetworkAddress(frontend.getHost(), + frontend.getRpcPort()); + FrontendService.Client client = null; + try { + client = ClientPool.frontendPool.borrowObject(address); + client.updateStatsCache(updateFollowerStatsCacheRequest); + } catch (Throwable t) { + LOG.warn("Failed to sync stats to follower: {}", address, t); + } finally { + if (client != null) { + ClientPool.frontendPool.returnObject(address, client); } + } + } - @Override - public boolean isDone() { - return true; - } + public void putCache(StatisticsCacheKey k, ColumnStatistic c) { + CompletableFuture> f = new CompletableFuture>(); + f.obtrudeValue(Optional.of(c)); + columnStatisticsCache.put(k, f); + } - @Override - public boolean complete(Optional value) { - return true; + private void loadPartStats(Map keyToColStats) { + final int batchSize = Config.expr_children_limit; + Set keySet = new HashSet<>(); + for (StatisticsCacheKey statisticsCacheKey : keyToColStats.keySet()) { + if (keySet.size() < batchSize - 1) { + 
keySet.add(statisticsCacheKey); + } else { + List partStats = StatisticsRepository.loadPartStats(keySet); + addPartStatsToColStats(keyToColStats, partStats); + keySet = new HashSet<>(); } + } + if (!keySet.isEmpty()) { + List partStats = StatisticsRepository.loadPartStats(keySet); + addPartStatsToColStats(keyToColStats, partStats); + } + } - @Override - public Optional join() { - return Optional.of(c); + private void addPartStatsToColStats(Map keyToColStats, + List partsStats) { + for (ResultRow r : partsStats) { + try { + StatsId statsId = new StatsId(r); + long tblId = statsId.tblId; + long idxId = statsId.idxId; + String partId = statsId.partId; + String colId = statsId.colId; + ColumnStatistic partStats = ColumnStatistic.fromResultRow(r); + keyToColStats.get(new StatisticsCacheKey(tblId, idxId, colId)).putPartStats(partId, partStats); + } catch (Throwable t) { + LOG.warn("Failed to deserialized part stats", t); } - }; - if (c.isUnKnown) { - return; } - columnStatisticsCache.put(k, f); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java index 9aaee6bf1d72ba..6521a8b4a5999b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java @@ -27,7 +27,6 @@ import org.apache.doris.common.util.MasterDaemon; import org.apache.doris.datasource.CatalogIf; import org.apache.doris.datasource.InternalCatalog; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.system.SystemInfoService; @@ -195,31 +194,32 @@ private long findExpiredStats(OlapTable statsTbl, ExpiredStats expiredStats, lon pos += StatisticConstants.FETCH_LIMIT; for (ResultRow r : rows) { try { - String id = r.getColumnValue("id"); - long catalogId = 
Long.parseLong(r.getColumnValue("catalog_id")); + StatsId statsId = new StatsId(r); + String id = statsId.id; + long catalogId = statsId.catalogId; if (!idToCatalog.containsKey(catalogId)) { expiredStats.expiredCatalog.add(catalogId); continue; } - long dbId = Long.parseLong(r.getColumnValue("db_id")); + long dbId = statsId.dbId; if (!idToDb.containsKey(dbId)) { expiredStats.expiredDatabase.add(dbId); continue; } - long tblId = Long.parseLong(r.getColumnValue("tbl_id")); + long tblId = statsId.tblId; if (!idToTbl.containsKey(tblId)) { expiredStats.expiredTable.add(tblId); continue; } - long idxId = Long.parseLong(r.getColumnValue("idx_id")); + long idxId = statsId.idxId; if (idxId != -1 && !idToMVIdx.containsKey(idxId)) { expiredStats.expiredIdxId.add(idxId); continue; } TableIf t = idToTbl.get(tblId); - String colId = r.getColumnValue("col_id"); + String colId = statsId.colId; if (t.getColumn(colId) == null) { expiredStats.ids.add(id); continue; @@ -228,12 +228,11 @@ private long findExpiredStats(OlapTable statsTbl, ExpiredStats expiredStats, lon continue; } OlapTable olapTable = (OlapTable) t; - String partIdStr = r.getColumnValue("part_id"); - if (partIdStr == null) { + String partId = statsId.partId; + if (partId == null) { continue; } - long partId = Long.parseLong(partIdStr); - if (!olapTable.getPartitionIds().contains(partId)) { + if (!olapTable.getPartitionIds().contains(Long.parseLong(partId))) { expiredStats.ids.add(id); } } catch (Exception e) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java new file mode 100644 index 00000000000000..2d5c48168357fc --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics; + +import org.apache.doris.catalog.Env; +import org.apache.doris.common.DdlException; +import org.apache.doris.common.util.MasterDaemon; +import org.apache.doris.statistics.util.StatisticsUtil; + +import org.apache.hudi.common.util.VisibleForTesting; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.util.HashMap; +import java.util.Map; + +public abstract class StatisticsCollector extends MasterDaemon { + + private static final Logger LOG = LogManager.getLogger(StatisticsCollector.class); + + protected final AnalysisTaskExecutor analysisTaskExecutor; + + + public StatisticsCollector(String name, long intervalMs, AnalysisTaskExecutor analysisTaskExecutor) { + super(name, intervalMs); + this.analysisTaskExecutor = analysisTaskExecutor; + analysisTaskExecutor.start(); + } + + @Override + protected void runAfterCatalogReady() { + if (!Env.getCurrentEnv().isMaster()) { + return; + } + if (!StatisticsUtil.statsTblAvailable()) { + LOG.info("Stats table not available, skip"); + return; + } + if (Env.isCheckpointThread()) { + return; + } + + if (!analysisTaskExecutor.idle()) { + LOG.info("Analyze tasks those submitted in last time is not finished, skip"); + return; + } + collect(); + } + + 
protected abstract void collect(); + + // Analysis job created by the system + @VisibleForTesting + protected void createSystemAnalysisJob(AnalysisInfo jobInfo) + throws DdlException { + if (jobInfo.colToPartitions.isEmpty()) { + // No statistics need to be collected or updated + return; + } + + Map analysisTaskInfos = new HashMap<>(); + AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); + analysisManager.createTaskForEachColumns(jobInfo, analysisTaskInfos, false); + if (StatisticsUtil.isExternalTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName)) { + analysisManager.createTableLevelTaskForExternalTable(jobInfo, analysisTaskInfos, false); + } + Env.getCurrentEnv().getAnalysisManager().registerSysJob(jobInfo, analysisTaskInfos); + analysisTaskInfos.values().forEach(analysisTaskExecutor::submitTask); + } + +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsPeriodCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsPeriodCollector.java new file mode 100644 index 00000000000000..f34ad0f1221de7 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsPeriodCollector.java @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics; + +import org.apache.doris.catalog.Env; +import org.apache.doris.common.Config; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.util.List; +import java.util.concurrent.TimeUnit; + +public class StatisticsPeriodCollector extends StatisticsCollector { + private static final Logger LOG = LogManager.getLogger(StatisticsPeriodCollector.class); + + public StatisticsPeriodCollector() { + super("Automatic Analyzer", + TimeUnit.MINUTES.toMillis(Config.auto_check_statistics_in_minutes) / 2, + new AnalysisTaskExecutor(Config.period_analyze_simultaneously_running_task_num)); + } + + @Override + protected void collect() { + try { + AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); + List jobInfos = analysisManager.findPeriodicJobs(); + for (AnalysisInfo jobInfo : jobInfos) { + createSystemAnalysisJob(jobInfo); + } + } catch (Exception e) { + LOG.warn("Failed to periodically analyze the statistics." 
, e); + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java index d20bb358c1807b..cd3cc67f3c91c7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java @@ -18,7 +18,6 @@ package org.apache.doris.statistics; import org.apache.doris.analysis.AlterColumnStatsStmt; -import org.apache.doris.analysis.AlterTableStatsStmt; import org.apache.doris.analysis.TableName; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; @@ -28,16 +27,15 @@ import org.apache.doris.common.DdlException; import org.apache.doris.common.FeConstants; import org.apache.doris.statistics.util.DBObjects; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.system.SystemInfoService; -import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; import org.apache.commons.text.StringSubstitutor; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -95,38 +93,18 @@ public class StatisticsRepository { + " ORDER BY update_time " + "LIMIT ${limit} OFFSET ${offset}"; - private static final String FETCH_STATS_PART_ID = "SELECT col_id, part_id FROM " + private static final String FETCH_STATS_PART_ID = "SELECT * FROM " + FeConstants.INTERNAL_DB_NAME + "." + StatisticConstants.STATISTIC_TBL_NAME + " WHERE tbl_id = ${tblId}" + " AND part_id IS NOT NULL"; - private static final String PERSIST_TABLE_STATS_TEMPLATE = "INSERT INTO " - + FeConstants.INTERNAL_DB_NAME + "."
+ StatisticConstants.ANALYSIS_TBL_NAME - + " VALUES('${id}', ${catalogId}, ${dbId}, ${tblId}, ${indexId}, ${partId}, ${rowCount}," - + " ${lastAnalyzeTimeInMs}, NOW())"; - - private static final String FETCH_TABLE_LEVEL_STATS_TEMPLATE = "SELECT * FROM " - + FeConstants.INTERNAL_DB_NAME + "." + StatisticConstants.ANALYSIS_TBL_NAME - + " WHERE tbl_id = ${tblId}" - + " AND part_id IS NULL"; - - private static final String FETCH_TABLE_LEVEL_PART_STATS_TEMPLATE = "SELECT * FROM " - + FeConstants.INTERNAL_DB_NAME + "." + StatisticConstants.ANALYSIS_TBL_NAME - + " WHERE part_id = ${partId}"; - - - private static final String FETCH_PART_TABLE_STATS_TEMPLATE = "SELECT * FROM " - + FeConstants.INTERNAL_DB_NAME + "." + StatisticConstants.ANALYSIS_TBL_NAME - + " WHERE tbl_id = ${tblId}" - + " AND part_id IS NOT NULL"; - private static final String QUERY_COLUMN_STATISTICS = "SELECT * FROM " + FeConstants.INTERNAL_DB_NAME + "." + StatisticConstants.STATISTIC_TBL_NAME + " WHERE " + "tbl_id=${tblId} AND idx_id=${idxId} AND col_id='${colId}'"; private static final String QUERY_PARTITION_STATISTICS = "SELECT * FROM " + FeConstants.INTERNAL_DB_NAME + "." + StatisticConstants.STATISTIC_TBL_NAME + " WHERE " - + " tbl_id=${tblId} AND idx_id=${idxId} AND col_id='${colId}' " + + " ${inPredicate}" + " AND part_id IS NOT NULL"; public static ColumnStatistic queryColumnStatisticsByName(long tableId, String colName) { @@ -201,8 +179,7 @@ private static String constructId(Object... 
params) { return stringJoiner.toString(); } - public static void dropStatistics(Set partIds) throws DdlException { - dropStatisticsByPartId(partIds, StatisticConstants.ANALYSIS_TBL_NAME); + public static void dropStatistics(Set partIds) throws DdlException { dropStatisticsByPartId(partIds, StatisticConstants.STATISTIC_TBL_NAME); } @@ -211,18 +188,6 @@ public static void dropStatistics(long tblId, Set colNames) throws DdlEx dropStatisticsByColName(tblId, colNames, StatisticConstants.HISTOGRAM_TBL_NAME); } - public static void dropExternalTableStatistics(long tblId) throws DdlException { - Map params = new HashMap<>(); - String inPredicate = String.format("tbl_id = %s", tblId); - params.put("tblName", StatisticConstants.ANALYSIS_TBL_NAME); - params.put("condition", inPredicate); - try { - StatisticsUtil.execUpdate(new StringSubstitutor(params).replace(DROP_TABLE_STATISTICS_TEMPLATE)); - } catch (Exception e) { - throw new DdlException(e.getMessage(), e); - } - } - public static void dropStatisticsByColName(long tblId, Set colNames, String statsTblName) throws DdlException { Map params = new HashMap<>(); @@ -237,7 +202,7 @@ public static void dropStatisticsByColName(long tblId, Set colNames, Str } } - public static void dropStatisticsByPartId(Set partIds, String statsTblName) throws DdlException { + public static void dropStatisticsByPartId(Set partIds, String statsTblName) throws DdlException { Map params = new HashMap<>(); String right = StatisticsUtil.joinElementsToString(partIds, ","); String inPredicate = String.format(" part_id IN (%s)", right); @@ -250,34 +215,6 @@ public static void dropStatisticsByPartId(Set partIds, String statsTblName } } - public static void persistTableStats(Map params) throws Exception { - StatisticsUtil.execUpdate(PERSIST_TABLE_STATS_TEMPLATE, params); - } - - public static void alterTableStatistics(AlterTableStatsStmt alterTableStatsStmt) throws Exception { - TableName tableName = alterTableStatsStmt.getTableName(); - DBObjects objects 
= StatisticsUtil.convertTableNameToObjects(tableName); - String rowCount = alterTableStatsStmt.getValue(StatsType.ROW_COUNT); - TableStatisticBuilder builder = new TableStatisticBuilder(); - builder.setRowCount(Long.parseLong(rowCount)); - builder.setLastAnalyzeTimeInMs(0); - TableStatistic tableStatistic = builder.build(); - Map params = new HashMap<>(); - String id = StatisticsUtil.constructId(objects.table.getId(), -1); - params.put("id", id); - params.put("catalogId", String.valueOf(objects.catalog.getId())); - params.put("dbId", String.valueOf(objects.db.getId())); - params.put("tblId", String.valueOf(objects.table.getId())); - params.put("indexId", "-1"); - params.put("partId", "NULL"); - params.put("rowCount", String.valueOf(tableStatistic.rowCount)); - params.put("lastAnalyzeTimeInMs", "0"); - StatisticsUtil.execUpdate(PERSIST_TABLE_STATS_TEMPLATE, params); - // TODO update statistics cache - // Env.getCurrentEnv().getStatisticsCache() - // .updateColStatsCache(objects.table.getId(), -1, builder.build()); - } - public static void alterColumnStatistics(AlterColumnStatsStmt alterColumnStatsStmt) throws Exception { TableName tableName = alterColumnStatsStmt.getTableName(); List partitionIds = alterColumnStatsStmt.getPartitionIds(); @@ -359,25 +296,24 @@ public static List fetchStatsFullName(long limit, long offset) { return StatisticsUtil.execStatisticQuery(new StringSubstitutor(params).replace(FETCH_STATS_FULL_NAME)); } - public static Map> fetchColAndPartsForStats(long tblId) { + public static Map> fetchColAndPartsForStats(long tblId) { Map params = Maps.newHashMap(); params.put("tblId", String.valueOf(tblId)); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String partSql = stringSubstitutor.replace(FETCH_STATS_PART_ID); List resultRows = StatisticsUtil.execStatisticQuery(partSql); - Map> columnToPartitions = Maps.newHashMap(); + Map> columnToPartitions = Maps.newHashMap(); resultRows.forEach(row -> { try { - String colId = 
row.getColumnValue("col_id"); - String partId = row.getColumnValue("part_id"); - if (partId == null) { + StatsId statsId = new StatsId(row); + if (statsId.partId == null) { return; } - columnToPartitions.computeIfAbsent(colId, - k -> new HashSet<>()).add(Long.valueOf(partId)); - } catch (NumberFormatException | DdlException e) { + columnToPartitions.computeIfAbsent(String.valueOf(statsId.colId), + k -> new HashSet<>()).add(statsId.partId); + } catch (NumberFormatException e) { LOG.warn("Failed to obtain the column and partition for statistics.", e); } @@ -386,50 +322,6 @@ public static Map> fetchColAndPartsForStats(long tblId) { return columnToPartitions; } - public static TableStatistic fetchTableLevelStats(long tblId) throws DdlException { - ImmutableMap params = ImmutableMap - .of("tblId", String.valueOf(tblId)); - String sql = StatisticsUtil.replaceParams(FETCH_TABLE_LEVEL_STATS_TEMPLATE, params); - List resultRows = StatisticsUtil.execStatisticQuery(sql); - if (resultRows.size() == 1) { - return TableStatistic.fromResultRow(resultRows.get(0)); - } - throw new DdlException("Query result is not as expected: " + sql); - } - - public static TableStatistic fetchTableLevelOfPartStats(long partId) throws DdlException { - ImmutableMap params = ImmutableMap - .of("partId", String.valueOf(partId)); - String sql = StatisticsUtil.replaceParams(FETCH_TABLE_LEVEL_PART_STATS_TEMPLATE, params); - List resultRows = StatisticsUtil.execStatisticQuery(sql); - if (resultRows.size() == 1) { - return TableStatistic.fromResultRow(resultRows.get(0)); - } - throw new DdlException("Query result is not as expected: " + sql); - } - - public static Map fetchTableLevelOfIdPartStats(long tblId) throws DdlException { - ImmutableMap params = ImmutableMap - .of("tblId", String.valueOf(tblId)); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - String sql = stringSubstitutor.replace(FETCH_PART_TABLE_STATS_TEMPLATE); - List resultRows = 
StatisticsUtil.execStatisticQuery(sql); - - if (resultRows.size() == 0) { - return Collections.emptyMap(); - } - - Map idToPartitionTableStats = Maps.newHashMap(); - - for (ResultRow resultRow : resultRows) { - long partId = Long.parseLong(resultRow.getColumnValue("part_id")); - TableStatistic partStats = TableStatistic.fromResultRow(resultRow); - idToPartitionTableStats.put(partId, partStats); - } - - return idToPartitionTableStats; - } - public static List loadColStats(long tableId, long idxId, String colName) { Map params = new HashMap<>(); params.put("tblId", String.valueOf(tableId)); @@ -440,12 +332,14 @@ public static List loadColStats(long tableId, long idxId, String colN .replace(QUERY_COLUMN_STATISTICS)); } - public static List loadPartStats(long tableId, long idxId, String colName) { + public static List loadPartStats(Collection keys) { + String inPredicate = "CONCAT(tbl_id, '-', idx_id, '-', col_id) in (%s)"; + StringJoiner sj = new StringJoiner(","); + for (StatisticsCacheKey statisticsCacheKey : keys) { + sj.add("'" + statisticsCacheKey.toString() + "'"); + } Map params = new HashMap<>(); - params.put("tblId", String.valueOf(tableId)); - params.put("idxId", String.valueOf(idxId)); - params.put("colId", colName); - + params.put("inPredicate", String.format(inPredicate, sj.toString())); return StatisticsUtil.execStatisticQuery(new StringSubstitutor(params) .replace(QUERY_PARTITION_STATISTICS)); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java index 6010daa6db4904..8c301f911be95b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java @@ -144,13 +144,6 @@ public StatsDeriveResult updateByLimit(long limit) { return statsDeriveResult; } - public StatsDeriveResult merge(StatsDeriveResult other) { - for (Entry entry : 
other.getSlotIdToColumnStats().entrySet()) { - this.slotIdToColumnStats.put(entry.getKey(), entry.getValue().copy()); - } - return this; - } - public StatsDeriveResult copy() { return new StatsDeriveResult(this); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java new file mode 100644 index 00000000000000..3f9b2641b75224 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +import org.apache.doris.statistics.util.StatisticsUtil; + +import java.util.StringJoiner; + +public class StatsId { + + public final String id; + public final long catalogId; + public final long dbId; + public final long tblId; + public final long idxId; + + public final String colId; + + // nullable + public final String partId; + + public StatsId(ResultRow row) { + this.id = row.get(0); + this.catalogId = Long.parseLong(row.get(1)); + this.dbId = Long.parseLong(row.get(2)); + this.tblId = Long.parseLong(row.get(3)); + this.idxId = Long.parseLong(row.get(4)); + this.colId = row.get(5); + this.partId = row.get(6); + } + + public String toSQL() { + StringJoiner sj = new StringJoiner(","); + sj.add(StatisticsUtil.quote(id)); + sj.add(String.valueOf(catalogId)); + sj.add(String.valueOf(dbId)); + sj.add(String.valueOf(tblId)); + sj.add(String.valueOf(idxId)); + sj.add(StatisticsUtil.quote(colId)); + sj.add(StatisticsUtil.quote(partId)); + return sj.toString(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatistic.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatistic.java deleted file mode 100644 index 28d0c17b561046..00000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatistic.java +++ /dev/null @@ -1,61 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics; - -import org.apache.doris.common.DdlException; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -public class TableStatistic { - - private static final Logger LOG = LogManager.getLogger(TableStatistic.class); - - public static TableStatistic UNKNOWN = new TableStatisticBuilder() - .setRowCount(0).setUpdateTime("NULL").setLastAnalyzeTimeInMs(0L) - .build(); - - public final long rowCount; - public final long lastAnalyzeTimeInMs; - public final String updateTime; - - public TableStatistic(long rowCount, long lastAnalyzeTimeInMs, String updateTime) { - this.rowCount = rowCount; - this.lastAnalyzeTimeInMs = lastAnalyzeTimeInMs; - this.updateTime = updateTime; - } - - // TODO: use thrift - public static TableStatistic fromResultRow(ResultRow resultRow) { - try { - TableStatisticBuilder tableStatisticBuilder = new TableStatisticBuilder(); - long rowCount = Long.parseLong(resultRow.getColumnValue("count")); - String updateTime = resultRow.getColumnValue("update_time"); - long lastAnalyzeTimeInMs = Long - .parseLong(resultRow.getColumnValue("last_analyze_time_in_ms")); - tableStatisticBuilder.setRowCount(rowCount); - tableStatisticBuilder.setLastAnalyzeTimeInMs(lastAnalyzeTimeInMs); - tableStatisticBuilder.setUpdateTime(updateTime); - return tableStatisticBuilder.build(); - } catch (DdlException e) { - LOG.warn("Failed to deserialize table statistics", e); - return TableStatistic.UNKNOWN; - } - } -} diff 
--git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticBuilder.java deleted file mode 100644 index ddb45b824cb1f8..00000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticBuilder.java +++ /dev/null @@ -1,51 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -package org.apache.doris.statistics; - -public class TableStatisticBuilder { - public long rowCount; - public long lastAnalyzeTimeInMs; - public String updateTime; - - public TableStatisticBuilder() { - } - - public TableStatisticBuilder(TableStatistic tableStatistic) { - this.rowCount = tableStatistic.rowCount; - this.updateTime = tableStatistic.updateTime; - } - - public TableStatisticBuilder setRowCount(long rowCount) { - this.rowCount = rowCount; - return this; - } - - public TableStatisticBuilder setLastAnalyzeTimeInMs(long lastAnalyzeTimeInMs) { - this.lastAnalyzeTimeInMs = lastAnalyzeTimeInMs; - return this; - } - - public TableStatisticBuilder setUpdateTime(String updateTime) { - this.updateTime = updateTime; - return this; - } - - public TableStatistic build() { - return new TableStatistic(rowCount, lastAnalyzeTimeInMs, updateTime); - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticsCacheLoader.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticsCacheLoader.java deleted file mode 100644 index 953bc9a42742b8..00000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticsCacheLoader.java +++ /dev/null @@ -1,60 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics; - -import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.TableIf; -import org.apache.doris.common.DdlException; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.Optional; - -public class TableStatisticsCacheLoader extends StatisticsCacheLoader> { - - private static final Logger LOG = LogManager.getLogger(TableStatisticsCacheLoader.class); - - @Override - protected Optional doLoad(StatisticsCacheKey key) { - try { - TableStatistic tableStatistic = StatisticsRepository.fetchTableLevelStats(key.tableId); - if (tableStatistic != TableStatistic.UNKNOWN) { - return Optional.of(tableStatistic); - } - } catch (DdlException e) { - LOG.debug("Fail to get table line number from table_statistics table. " - + "Will try to get from data source.", e); - } - // Get row count by call TableIf interface getRowCount - // when statistic table doesn't contain a record for this table. 
- try { - TableIf table = Env.getCurrentEnv().getCatalogMgr().getCatalog(key.catalogId) - .getDbOrDdlException(key.dbId).getTableOrAnalysisException(key.tableId); - long rowCount = table.getRowCount(); - long lastAnalyzeTimeInMs = System.currentTimeMillis(); - String updateTime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date(lastAnalyzeTimeInMs)); - return Optional.of(new TableStatistic(rowCount, lastAnalyzeTimeInMs, updateTime)); - } catch (Exception e) { - LOG.warn(String.format("Fail to get row count for table %d", key.tableId), e); - } - return Optional.empty(); - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java new file mode 100644 index 00000000000000..17ca61e9da5c52 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +import org.apache.doris.common.io.Text; +import org.apache.doris.common.io.Writable; +import org.apache.doris.persist.gson.GsonUtils; +import org.apache.doris.statistics.AnalysisInfo.JobType; + +import com.google.gson.annotations.SerializedName; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; + +public class TableStatsMeta implements Writable { + + @SerializedName("tblId") + public final long tblId; + + @SerializedName("idxId") + public final long idxId; + @SerializedName("updatedRows") + public final AtomicLong updatedRows = new AtomicLong(); + + // We would like to analyze tables which queried frequently with higher priority in the future. + @SerializedName("queriedTimes") + public final AtomicLong queriedTimes = new AtomicLong(); + + // Used for external table. + @SerializedName("rowCount") + public final long rowCount; + + @SerializedName("updateTime") + public long updatedTime; + + @SerializedName("colNameToColStatsMeta") + private ConcurrentMap colNameToColStatsMeta = new ConcurrentHashMap<>(); + + @SerializedName("trigger") + public JobType jobType; + + // It's necessary to store these fields separately from AnalysisInfo, since the lifecycle between AnalysisInfo + // and TableStats is quite different. 
+ public TableStatsMeta(long tblId, long rowCount, AnalysisInfo analyzedJob) { + this.tblId = tblId; + this.idxId = -1; + this.rowCount = rowCount; + updateByJob(analyzedJob); + } + + @Override + public void write(DataOutput out) throws IOException { + String json = GsonUtils.GSON.toJson(this); + Text.writeString(out, json); + } + + public static TableStatsMeta read(DataInput dataInput) throws IOException { + String json = Text.readString(dataInput); + TableStatsMeta tableStats = GsonUtils.GSON.fromJson(json, TableStatsMeta.class); + // Might be null counterintuitively, for compatible + if (tableStats.colNameToColStatsMeta == null) { + tableStats.colNameToColStatsMeta = new ConcurrentHashMap<>(); + } + return tableStats; + } + + public long findColumnLastUpdateTime(String colName) { + ColStatsMeta colStatsMeta = colNameToColStatsMeta.get(colName); + if (colStatsMeta == null) { + return 0; + } + return colStatsMeta.updatedTime; + } + + public ColStatsMeta findColumnStatsMeta(String colName) { + return colNameToColStatsMeta.get(colName); + } + + public void removeColumn(String colName) { + colNameToColStatsMeta.remove(colName); + } + + public Set analyzeColumns() { + return colNameToColStatsMeta.keySet(); + } + + public void reset() { + updatedTime = 0; + colNameToColStatsMeta.values().forEach(ColStatsMeta::clear); + } + + public void updateByJob(AnalysisInfo analyzedJob) { + updatedTime = System.currentTimeMillis(); + String colNameStr = analyzedJob.colName; + // colName field AnalyzeJob's format likes: "[col1, col2]", we need to remove brackets here + // TODO: Refactor this later + if (analyzedJob.colName.startsWith("[") && analyzedJob.colName.endsWith("]")) { + colNameStr = colNameStr.substring(1, colNameStr.length() - 1); + } + List cols = Arrays.stream(colNameStr.split(",")).map(String::trim).collect(Collectors.toList()); + for (String col : cols) { + ColStatsMeta colStatsMeta = colNameToColStatsMeta.get(col); + if (colStatsMeta == null) { + 
colNameToColStatsMeta.put(col, new ColStatsMeta(updatedTime, + analyzedJob.analysisMethod, analyzedJob.analysisType, analyzedJob.jobType, 0)); + } else { + colStatsMeta.updatedTime = updatedTime; + colStatsMeta.analysisType = analyzedJob.analysisType; + colStatsMeta.analysisMethod = analyzedJob.analysisMethod; + } + } + jobType = analyzedJob.jobType; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TaskStatusWrapper.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TaskStatusWrapper.java new file mode 100644 index 00000000000000..d74b14267d1eca --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/TaskStatusWrapper.java @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +public class TaskStatusWrapper { + + public final AnalysisInfo info; + public final AnalysisState taskState; + public final String message; + public final long time; + + public TaskStatusWrapper(AnalysisInfo info, AnalysisState taskState, String message, long time) { + this.info = info; + this.taskState = taskState; + this.message = message; + this.time = time; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQuery.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQuery.java index 09af38d830a709..40669b6a9396ea 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQuery.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQuery.java @@ -24,7 +24,6 @@ import org.apache.doris.analysis.StatementBase; import org.apache.doris.analysis.UserIdentity; import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.cluster.ClusterNamespace; import org.apache.doris.common.Config; import org.apache.doris.common.DdlException; @@ -38,7 +37,7 @@ import org.apache.doris.qe.OriginStatement; import org.apache.doris.qe.QeProcessorImpl; import org.apache.doris.qe.RowBatch; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; +import org.apache.doris.statistics.ResultRow; import org.apache.doris.system.SystemInfoService; import org.apache.doris.thrift.TQueryOptions; import org.apache.doris.thrift.TResultBatch; @@ -50,9 +49,9 @@ import java.io.StringReader; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.List; import java.util.UUID; -import java.util.stream.Collectors; /** * Execute SQL query statements internally(in FE). 
Internal-query mainly used for statistics module, @@ -87,7 +86,7 @@ public void setTimeout(int timeout) { * @return Result of the query statement * @throws Exception Errors in parsing or execution */ - public InternalQueryResult query() throws Exception { + public List query() throws Exception { // step1: mock connectContext buildContext(); @@ -180,14 +179,9 @@ private void execute() throws Exception { } } - private InternalQueryResult fetchResult() { + private List fetchResult() { List columns = stmt.getColLabels(); - List types = stmt.getResultExprs().stream() - .map(e -> e.getType().getPrimitiveType()) - .collect(Collectors.toList()); - - InternalQueryResult result = new InternalQueryResult(); - List resultRows = result.getResultRows(); + List resultRows = new ArrayList<>(); for (TResultBatch batch : resultBatches) { List rows = batch.getRows(); @@ -200,12 +194,11 @@ private InternalQueryResult fetchResult() { values.add(value); } - ResultRow resultRow = new ResultRow(columns, types, values); + ResultRow resultRow = new ResultRow(values); resultRows.add(resultRow); } } - - return result; + return resultRows; } public void cancel() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQueryResult.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQueryResult.java deleted file mode 100644 index e7919860107531..00000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQueryResult.java +++ /dev/null @@ -1,242 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics.util; - -import org.apache.doris.catalog.PrimitiveType; -import org.apache.doris.common.DdlException; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import java.util.Collections; -import java.util.List; -import java.util.Map; - -/** - * Readable results of internal SQL execution, - * providing some read operations. - */ -public class InternalQueryResult { - private final List resultRows = Lists.newArrayList(); - - public InternalQueryResult() { - } - - public List getResultRows() { - return resultRows; - } - - public static class ResultRow { - private final List columns; - private final List types; - private final List values; - - private final Map columnNameMap = Maps.newHashMap(); - private final Map columnIndexMap = Maps.newHashMap(); - - public ResultRow(List columns, List types, List values) { - this.columns = columns; - this.types = types; - this.values = values; - buildColumnNameMap(); - buildColumnIndexMap(); - } - - public List getColumns() { - return columns != null ? columns : Collections.emptyList(); - } - - public List getTypes() { - return types != null ? types : Collections.emptyList(); - } - - public List getValues() { - return values != null ? 
values : Collections.emptyList(); - } - - private void buildColumnNameMap() { - List columns = getColumns(); - for (int i = 0; i < columns.size(); i++) { - columnNameMap.put(columns.get(i), i); - } - } - - private void buildColumnIndexMap() { - List columns = getColumns(); - for (int i = 0; i < columns.size(); i++) { - columnIndexMap.put(i, columns.get(i)); - } - } - - public int getColumnIndex(String columnName) { - return columnNameMap.getOrDefault(columnName, -1); - } - - public String getColumnName(int index) throws DdlException { - List columns = getColumns(); - if (columnIndexMap.containsKey(index)) { - return columnIndexMap.get(index); - } else { - throw new DdlException("Index should be between 0 and " + columns.size()); - } - } - - public PrimitiveType getColumnType(String columnName) throws DdlException { - List types = getTypes(); - int index = getColumnIndex(columnName); - if (index == -1) { - throw new DdlException(String.format("The column name:[%s] does not exist.", columnName)); - } - return types.get(index); - } - - public PrimitiveType getColumnType(int index) throws DdlException { - List types = getTypes(); - if (index >= 0 && index < types.size()) { - return types.get(index); - } else { - throw new DdlException("Index should be between 0 and " + types.size()); - } - } - - public String getColumnValue(String columnName) throws DdlException { - int index = getColumnIndex(columnName); - if (index == -1) { - throw new DdlException(String.format("The column name:[%s] does not exist.", columnName)); - } - return values.get(index); - } - - public String getColumnValueWithDefault(String columnName, String defaultVal) throws DdlException { - String val = getColumnValue(columnName); - return val == null ? 
defaultVal : val; - } - - public Object getColumnValue(int index) throws DdlException { - List columns = getColumns(); - if (index >= 0 && index < columns.size()) { - return values.get(index); - } else { - throw new DdlException("Index should be between 0 and " + columns.size()); - } - } - - public String getString(int index) throws DdlException { - List columns = getColumns(); - if (index >= 0 && index < columns.size()) { - return values.get(index); - } - throw new DdlException("Index should be between 0 and " + columns.size()); - } - - public int getInt(int index) throws DdlException { - List types = getTypes(); - if (index >= 0 && index < types.size()) { - String value = values.get(index); - PrimitiveType type = types.get(index); - switch (type) { - case BOOLEAN: - case TINYINT: - case SMALLINT: - case INT: - case BIGINT: - return new Integer(value); - default: - throw new DdlException("Unable to convert field to int: " + value); - } - } - throw new DdlException("Index should be between 0 and " + types.size()); - } - - public long getLong(int index) throws DdlException { - List types = getTypes(); - if (index >= 0 && index < types.size()) { - String value = values.get(index); - PrimitiveType type = types.get(index); - switch (type) { - case TINYINT: - case SMALLINT: - case INT: - case BIGINT: - return Long.parseLong(value); - default: - throw new DdlException("Unable to convert field to long: " + value); - } - } - throw new DdlException("Index should be between 0 and " + types.size()); - } - - public float getFloat(int index) throws DdlException { - List types = getTypes(); - if (index >= 0 && index < types.size()) { - String value = values.get(index); - PrimitiveType type = types.get(index); - if (type == PrimitiveType.FLOAT) { - return Float.parseFloat(value); - } - throw new DdlException("Unable to convert field to float: " + value); - } - throw new DdlException("Index should be between 0 and " + types.size()); - } - - public double getDouble(int index) 
throws DdlException { - List types = getTypes(); - if (index >= 0 && index < types.size()) { - String value = values.get(index); - PrimitiveType type = types.get(index); - if (type == PrimitiveType.DOUBLE) { - return Double.parseDouble(value); - } - throw new DdlException("Unable to convert field to long: " + value); - } - throw new DdlException("Index should be between 0 and " + types.size()); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("ResultRow{ "); - if (values != null && values.size() > 0) { - List columns = getColumns(); - for (int i = 0; i < values.size(); i++) { - sb.append(columns.get(i)); - sb.append(":"); - sb.append(values.get(i)); - sb.append(" "); - } - } - sb.append("}"); - return sb.toString(); - } - } - - @Override - public String toString() { - if (resultRows.size() > 0) { - StringBuilder sb = new StringBuilder(); - sb.append("InternalQueryResult:\n"); - for (ResultRow resultRow : resultRows) { - sb.append(" - "); - sb.append(resultRow.toString()); - sb.append("\n"); - } - return sb.toString(); - } - return "InternalQueryResult{" + "resultRows=" + resultRows + '}'; - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/SimpleQueue.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/SimpleQueue.java new file mode 100644 index 00000000000000..5740c4e30885a3 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/SimpleQueue.java @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics.util; + +import java.util.Collection; +import java.util.LinkedList; +import java.util.function.Function; + +// Any operation on this structure should be thread-safe +public class SimpleQueue extends LinkedList { + + private final long limit; + + private final Function offerFunc; + + private final Function evictFunc; + + + public SimpleQueue(long limit, Function offerFunc, Function evictFunc) { + this.limit = limit; + this.offerFunc = offerFunc; + this.evictFunc = evictFunc; + } + + @Override + public synchronized boolean offer(T analysisInfo) { + while (size() >= limit) { + remove(); + } + super.offer(analysisInfo); + offerFunc.apply(analysisInfo); + return true; + } + + @Override + public synchronized T remove() { + T analysisInfo = super.remove(); + evictFunc.apply(analysisInfo); + return analysisInfo; + } + + public SimpleQueue(long limit, Function offerFunc, Function evictFunc, Collection collection) { + this(limit, offerFunc, evictFunc); + if (collection != null) { + for (T e : collection) { + offer(e); + } + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index 89a5ae1f3e6866..40ae13a0e0e293 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -25,10 +25,12 @@ import org.apache.doris.analysis.IntLiteral; import 
org.apache.doris.analysis.LargeIntLiteral; import org.apache.doris.analysis.LiteralExpr; +import org.apache.doris.analysis.SetType; import org.apache.doris.analysis.StatementBase; import org.apache.doris.analysis.StringLiteral; import org.apache.doris.analysis.TableName; import org.apache.doris.analysis.UserIdentity; +import org.apache.doris.analysis.VariableExpr; import org.apache.doris.catalog.ArrayType; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.DatabaseIf; @@ -44,10 +46,12 @@ import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.Type; import org.apache.doris.catalog.VariantType; +import org.apache.doris.catalog.external.ExternalTable; import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; +import org.apache.doris.common.Pair; import org.apache.doris.common.UserException; import org.apache.doris.datasource.CatalogIf; import org.apache.doris.datasource.HMSExternalCatalog; @@ -61,12 +65,13 @@ import org.apache.doris.qe.QueryState; import org.apache.doris.qe.SessionVariable; import org.apache.doris.qe.StmtExecutor; -import org.apache.doris.statistics.AnalysisInfo; +import org.apache.doris.qe.VariableMgr; import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; import org.apache.doris.statistics.Histogram; +import org.apache.doris.statistics.ResultRow; import org.apache.doris.statistics.StatisticConstants; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; +import org.apache.doris.system.Frontend; import org.apache.doris.system.SystemInfoService; import com.google.common.base.Preconditions; @@ -82,9 +87,12 @@ import org.apache.iceberg.types.Types; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.apache.thrift.TException; +import java.net.InetSocketAddress; import 
java.text.SimpleDateFormat; +import java.time.LocalTime; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -94,6 +102,7 @@ import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.Set; import java.util.StringJoiner; import java.util.function.Function; import java.util.stream.Collectors; @@ -102,7 +111,6 @@ public class StatisticsUtil { private static final Logger LOG = LogManager.getLogger(StatisticsUtil.class); private static final String ID_DELIMITER = "-"; - private static final String VALUES_DELIMITER = ","; private static final String TOTAL_SIZE = "totalSize"; private static final String NUM_ROWS = "numRows"; @@ -142,16 +150,6 @@ public static QueryState execUpdate(String sql) throws Exception { } } - public static List deserializeToAnalysisJob(List resultBatches) - throws TException { - if (CollectionUtils.isEmpty(resultBatches)) { - return Collections.emptyList(); - } - return resultBatches.stream() - .map(AnalysisInfo::fromResultRow) - .collect(Collectors.toList()); - } - public static ColumnStatistic deserializeToColumnStatistics(List resultBatches) throws Exception { if (CollectionUtils.isEmpty(resultBatches)) { @@ -166,15 +164,22 @@ public static List deserializeToHistogramStatistics(List r } public static AutoCloseConnectContext buildConnectContext() { + return buildConnectContext(false); + } + + public static AutoCloseConnectContext buildConnectContext(boolean limitScan) { ConnectContext connectContext = new ConnectContext(); SessionVariable sessionVariable = connectContext.getSessionVariable(); sessionVariable.internalSession = true; sessionVariable.setMaxExecMemByte(Config.statistics_sql_mem_limit_in_bytes); + sessionVariable.cpuResourceLimit = Config.cpu_resource_limit_per_analyze_task; sessionVariable.setEnableInsertStrict(true); + sessionVariable.enablePageCache = false; 
sessionVariable.parallelExecInstanceNum = Config.statistics_sql_parallel_exec_instance_num; sessionVariable.parallelPipelineTaskNum = Config.statistics_sql_parallel_exec_instance_num; sessionVariable.setEnableNereidsPlanner(false); sessionVariable.enableProfile = false; + sessionVariable.enableScanRunSerial = limitScan; sessionVariable.queryTimeoutS = Config.analyze_task_timeout_in_hours * 60 * 60; sessionVariable.insertTimeoutS = Config.analyze_task_timeout_in_hours * 60 * 60; sessionVariable.enableFileCache = false; @@ -216,7 +221,7 @@ public static LiteralExpr readableValue(Type type, String columnValue) throws An case DOUBLE: return new FloatLiteral(columnValue); case DECIMALV2: - //no need to check precision and scale, since V2 is fixed point + // no need to check precision and scale, since V2 is fixed point return new DecimalLiteral(columnValue); case DECIMAL32: case DECIMAL64: @@ -392,11 +397,12 @@ public static boolean statsTblAvailable() { .findTable(InternalCatalog.INTERNAL_CATALOG_NAME, dbName, StatisticConstants.STATISTIC_TBL_NAME)); - statsTbls.add( - (OlapTable) StatisticsUtil - .findTable(InternalCatalog.INTERNAL_CATALOG_NAME, - dbName, - StatisticConstants.HISTOGRAM_TBL_NAME)); + // uncomment it when hist is available for user. 
+ // statsTbls.add( + // (OlapTable) StatisticsUtil + // .findTable(InternalCatalog.INTERNAL_CATALOG_NAME, + // dbName, + // StatisticConstants.HISTOGRAM_TBL_NAME)); } catch (Throwable t) { return false; } @@ -430,6 +436,15 @@ public static Map getPartitionIdToName(TableIf table) { )); } + public static Set getPartitionIds(TableIf table) { + if (table instanceof OlapTable) { + return ((OlapTable) table).getPartitionIds().stream().map(String::valueOf).collect(Collectors.toSet()); + } else if (table instanceof ExternalTable) { + return table.getPartitionNames(); + } + throw new RuntimeException(String.format("Not supported Table %s", table.getClass().getName())); + } + public static String joinElementsToString(Collection values, String delimiter) { StringJoiner builder = new StringJoiner(delimiter); values.forEach(v -> builder.add(String.valueOf(v))); @@ -475,9 +490,9 @@ public static String replaceParams(String template, Map params) * when update_rows < row_count, the health degree is 100 (1 - update_rows row_count). * * @param updatedRows The number of rows updated by the table - * @return Health, the value range is [0, 100], the larger the value, * @param totalRows The current number of rows in the table - * the healthier the statistics of the table + * the healthier the statistics of the table + * @return Health, the value range is [0, 100], the larger the value, */ public static int getTableHealth(long totalRows, long updatedRows) { if (updatedRows >= totalRows) { @@ -491,19 +506,25 @@ public static int getTableHealth(long totalRows, long updatedRows) { /** * Estimate hive table row count. * First get it from remote table parameters. If not found, estimate it : totalSize/estimatedRowSize + * * @param table Hive HMSExternalTable to estimate row count. + * @param isInit Flag to indicate if this is called during init. To avoid recursively get schema. 
* @return estimated row count */ - public static long getHiveRowCount(HMSExternalTable table) { + public static long getHiveRowCount(HMSExternalTable table, boolean isInit) { Map parameters = table.getRemoteTable().getParameters(); if (parameters == null) { return -1; } // Table parameters contains row count, simply get and return it. if (parameters.containsKey(NUM_ROWS)) { - return Long.parseLong(parameters.get(NUM_ROWS)); + long rows = Long.parseLong(parameters.get(NUM_ROWS)); + // Sometimes, the NUM_ROWS in hms is 0 but actually is not. Need to check TOTAL_SIZE if NUM_ROWS is 0. + if (rows != 0) { + return rows; + } } - if (!parameters.containsKey(TOTAL_SIZE)) { + if (!parameters.containsKey(TOTAL_SIZE) || isInit) { return -1; } // Table parameters doesn't contain row count but contain total size. Estimate row count : totalSize/rowSize @@ -521,6 +542,7 @@ public static long getHiveRowCount(HMSExternalTable table) { /** * Estimate iceberg table row count. * Get the row count by adding all task file recordCount. + * * @param table Iceberg HMSExternalTable to estimate row count. * @return estimated row count */ @@ -544,6 +566,7 @@ public static long getIcebergRowCount(HMSExternalTable table) { /** * Estimate hive table row count : totalFileSize/estimatedRowSize + * * @param table Hive HMSExternalTable to estimate row count. * @return estimated row count */ @@ -618,6 +641,7 @@ public static long getRowCountFromFileList(HMSExternalTable table) { /** * Get Iceberg column statistics. + * * @param colName * @param table Iceberg table. * @return Optional Column statistic for the given column. 
@@ -626,8 +650,8 @@ public static Optional getIcebergColumnStats(String colName, or TableScan tableScan = table.newScan().includeColumnStats(); ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(); columnStatisticBuilder.setCount(0); - columnStatisticBuilder.setMaxValue(Double.MAX_VALUE); - columnStatisticBuilder.setMinValue(Double.MIN_VALUE); + columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY); + columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY); columnStatisticBuilder.setDataSize(0); columnStatisticBuilder.setAvgSizeByte(0); columnStatisticBuilder.setNumNulls(0); @@ -642,7 +666,7 @@ public static Optional getIcebergColumnStats(String colName, or } private static void processDataFile(DataFile dataFile, PartitionSpec partitionSpec, - String colName, ColumnStatisticBuilder columnStatisticBuilder) { + String colName, ColumnStatisticBuilder columnStatisticBuilder) { int colId = -1; for (Types.NestedField column : partitionSpec.schema().columns()) { if (column.name().equals(colName)) { @@ -678,4 +702,87 @@ public static void sleep(long millis) { // IGNORE } } + + public static String quote(String str) { + return "'" + str + "'"; + } + + public static boolean isMaster(Frontend frontend) { + InetSocketAddress socketAddress = new InetSocketAddress(frontend.getHost(), frontend.getEditLogPort()); + return Env.getCurrentEnv().getHaProtocol().getLeader().equals(socketAddress); + } + + public static String escapeSQL(String str) { + if (str == null) { + return null; + } + return org.apache.commons.lang3.StringUtils.replace(str, "'", "''"); + } + + public static boolean isExternalTable(String catalogName, String dbName, String tblName) { + TableIf table; + try { + table = StatisticsUtil.findTable(catalogName, dbName, tblName); + } catch (Throwable e) { + LOG.warn(e.getMessage()); + return false; + } + return table instanceof ExternalTable; + } + + public static boolean inAnalyzeTime(LocalTime now) { + try { + Pair range = 
findRangeFromGlobalSessionVar(); + if (range == null) { + return false; + } + LocalTime start = range.first; + LocalTime end = range.second; + if (start.isAfter(end) && (now.isAfter(start) || now.isBefore(end))) { + return true; + } else { + return now.isAfter(start) && now.isBefore(end); + } + } catch (DateTimeParseException e) { + LOG.warn("Parse analyze start/end time format fail", e); + return true; + } + } + + private static Pair findRangeFromGlobalSessionVar() { + try { + String startTime = + findRangeFromGlobalSessionVar(SessionVariable.FULL_AUTO_ANALYZE_START_TIME) + .fullAutoAnalyzeStartTime; + // For compatibility + if (StringUtils.isEmpty(startTime)) { + startTime = StatisticConstants.FULL_AUTO_ANALYZE_START_TIME; + } + String endTime = findRangeFromGlobalSessionVar(SessionVariable.FULL_AUTO_ANALYZE_END_TIME) + .fullAutoAnalyzeEndTime; + if (StringUtils.isEmpty(startTime)) { + endTime = StatisticConstants.FULL_AUTO_ANALYZE_END_TIME; + } + DateTimeFormatter timeFormatter = DateTimeFormatter.ofPattern("HH:mm:ss"); + return Pair.of(LocalTime.parse(startTime, timeFormatter), LocalTime.parse(endTime, timeFormatter)); + } catch (Exception e) { + return null; + } + } + + private static SessionVariable findRangeFromGlobalSessionVar(String varName) throws Exception { + SessionVariable sessionVariable = VariableMgr.newSessionVariable(); + VariableExpr variableExpr = new VariableExpr(varName, SetType.GLOBAL); + VariableMgr.getValue(sessionVariable, variableExpr); + return sessionVariable; + } + + public static boolean enableAutoAnalyze() { + try { + return findRangeFromGlobalSessionVar(SessionVariable.ENABLE_FULL_AUTO_ANALYZE).enableFullAutoAnalyze; + } catch (Exception e) { + LOG.warn("Fail to get value of enable auto analyze, return false by default", e); + } + return false; + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/jobs/joinorder/hypergraph/OtherJoinTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/jobs/joinorder/hypergraph/OtherJoinTest.java index feeb971b15ee69..a4062d2edff9c5 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/jobs/joinorder/hypergraph/OtherJoinTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/jobs/joinorder/hypergraph/OtherJoinTest.java @@ -20,6 +20,7 @@ import org.apache.doris.nereids.CascadesContext; import org.apache.doris.nereids.datasets.tpch.TPCHTestBase; import org.apache.doris.nereids.trees.plans.Plan; +import org.apache.doris.nereids.trees.plans.logical.LogicalProject; import org.apache.doris.nereids.util.HyperGraphBuilder; import org.apache.doris.nereids.util.MemoTestUtils; import org.apache.doris.nereids.util.PlanChecker; @@ -32,23 +33,37 @@ public class OtherJoinTest extends TPCHTestBase { @Test - public void randomTest() { + public void test() { + for (int t = 3; t < 10; t++) { + for (int e = t - 1; e <= (t * (t - 1)) / 2; e++) { + for (int i = 0; i < 10; i++) { + System.out.println(String.valueOf(t) + " " + e + ": " + i); + randomTest(t, e); + } + } + } + } + + private void randomTest(int tableNum, int edgeNum) { HyperGraphBuilder hyperGraphBuilder = new HyperGraphBuilder(); Plan plan = hyperGraphBuilder - .randomBuildPlanWith(10, 20); - Set> res1 = hyperGraphBuilder.evaluate(plan); + .randomBuildPlanWith(tableNum, edgeNum); + plan = new LogicalProject(plan.getOutput(), plan); + Set> res1 = hyperGraphBuilder.evaluate(plan); CascadesContext cascadesContext = MemoTestUtils.createCascadesContext(connectContext, plan); hyperGraphBuilder.initStats(cascadesContext); Plan optimizedPlan = PlanChecker.from(cascadesContext) - .dpHypOptimize() - .getBestPlanTree(); + .dpHypOptimize() + .getBestPlanTree(); - Set> res2 = hyperGraphBuilder.evaluate(optimizedPlan); + Set> res2 = hyperGraphBuilder.evaluate(optimizedPlan); if (!res1.equals(res2)) { - System.out.println(res1); - System.out.println(res2); System.out.println(plan.treeString()); 
System.out.println(optimizedPlan.treeString()); + cascadesContext = MemoTestUtils.createCascadesContext(connectContext, plan); + PlanChecker.from(cascadesContext).dpHypOptimize().getBestPlanTree(); + System.out.println(res1); + System.out.println(res2); } Assertions.assertTrue(res1.equals(res2)); diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java index 1fe5e5b0a0e6f6..31affe06252bd8 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java @@ -17,6 +17,7 @@ package org.apache.doris.nereids.stats; +import org.apache.doris.analysis.IntLiteral; import org.apache.doris.nereids.trees.expressions.And; import org.apache.doris.nereids.trees.expressions.Cast; import org.apache.doris.nereids.trees.expressions.EqualTo; @@ -75,7 +76,7 @@ public void testOrNaN() { Statistics expected = filterEstimation.estimate(or, stat); Assertions.assertTrue( Precision.equals(expected.getRowCount(), 750, - 0.01)); + 0.01)); } // a > 500 and b < 100 @@ -132,12 +133,12 @@ public void testNotInNaN() { Map slotToColumnStat = new HashMap<>(); ColumnStatisticBuilder builder = new ColumnStatisticBuilder() .setNdv(500) - .setIsUnknown(true); + .setIsUnknown(false); slotToColumnStat.put(a, builder.build()); Statistics stat = new Statistics(1000, slotToColumnStat); FilterEstimation filterEstimation = new FilterEstimation(); Statistics expected = filterEstimation.estimate(notIn, stat); - Assertions.assertTrue(Precision.equals(666.666, expected.getRowCount(), 0.01)); + Assertions.assertTrue(Precision.equals(1000, expected.getRowCount(), 0.01)); } /** @@ -165,7 +166,6 @@ public void testRelatedAnd() { ColumnStatistic aStatsEst = result.findColumnStatistics(a); Assertions.assertEquals(100, aStatsEst.minValue); Assertions.assertEquals(200, 
aStatsEst.maxValue); - Assertions.assertEquals(1.0, aStatsEst.selectivity); Assertions.assertEquals(10, aStatsEst.ndv); } @@ -198,7 +198,7 @@ public void test1() { Statistics stat = new Statistics(1000, slotToColumnStat); FilterEstimation filterEstimation = new FilterEstimation(); Statistics expected = filterEstimation.estimate(or, stat); - Assertions.assertEquals(51, expected.getRowCount(), 0.1); + Assertions.assertEquals(51.9, expected.getRowCount(), 0.1); } // a > 500 and b < 100 or a > c @@ -418,7 +418,9 @@ public void test10() { .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(1) - .setMaxValue(10); + .setMinExpr(new IntLiteral(1)) + .setMaxValue(10) + .setMaxExpr(new IntLiteral(10)); slotToColumnStat.put(a, builder.build()); Statistics stat = new Statistics(1000, slotToColumnStat); FilterEstimation filterEstimation = new FilterEstimation(); @@ -467,22 +469,19 @@ public void test12() { .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(1000) - .setMaxValue(10000) - .setSelectivity(1.0); + .setMaxValue(10000); ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(500) - .setSelectivity(1.0); + .setMaxValue(500); ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(200) - .setSelectivity(1.0); + .setMaxValue(200); slotToColumnStat.put(a, builderA.build()); slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -504,7 +503,7 @@ public void test12() { * filter range has intersection with (c.min, c.max) * rows = 100 * a primary key, a.ndv reduced by 1/4 - * b normal field, b.ndv=20 => + * b normal field, b.ndv=20 * c.ndv = 10/40 * c.ndv */ @Test @@ -524,22 +523,19 @@ public void testFilterInsideMinMax() { .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(100) - .setSelectivity(1.0); + .setMaxValue(100); ColumnStatisticBuilder builderB = new 
ColumnStatisticBuilder() .setNdv(20) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(500) - .setSelectivity(1.0); + .setMaxValue(500); ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() .setNdv(40) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(40) - .setSelectivity(1.0); + .setMaxValue(40); slotToColumnStat.put(a, builderA.build()); slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -554,25 +550,21 @@ public void testFilterInsideMinMax() { Assertions.assertEquals(100, statsA.maxValue); ColumnStatistic statsB = estimated.findColumnStatistics(b); - Assertions.assertEquals(15.6, statsB.ndv, 0.1); + Assertions.assertEquals(20, statsB.ndv, 0.1); Assertions.assertEquals(0, statsB.minValue); Assertions.assertEquals(500, statsB.maxValue); - Assertions.assertEquals(1.0, statsB.selectivity); ColumnStatistic statsC = estimated.findColumnStatistics(c); Assertions.assertEquals(10, statsC.ndv); Assertions.assertEquals(10, statsC.minValue); Assertions.assertEquals(20, statsC.maxValue); - Assertions.assertEquals(1.0, statsC.selectivity); } /** * test filter estimation, c > 300, where 300 is out of c's range (0,200) * after filter - * c.selectivity=a.selectivity=b.selectivity = 0 * c.ndv=a.ndv=b.ndv=0 - * a.ndv = b.ndv = 0 */ @Test @@ -587,23 +579,23 @@ public void testFilterOutofMinMax() { .setNdv(1000) .setAvgSizeByte(4) .setNumNulls(0) - .setMinValue(10000) - .setMaxValue(1000) - .setSelectivity(1.0); + .setMinValue(1000) + .setMaxValue(10000) + .setCount(1000); ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) .setMaxValue(500) - .setSelectivity(1.0); + .setCount(1000); ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) .setMaxValue(200) - .setSelectivity(1.0); + .setCount(1000); slotToColumnStat.put(a, builderA.build()); 
slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -616,8 +608,8 @@ public void testFilterOutofMinMax() { Assertions.assertEquals(0, statsB.ndv); ColumnStatistic statsC = estimated.findColumnStatistics(c); Assertions.assertEquals(0, statsC.ndv); - Assertions.assertTrue(Double.isNaN(statsC.minValue)); - Assertions.assertTrue(Double.isNaN(statsC.maxValue)); + Assertions.assertTrue(Double.isInfinite(statsC.minValue)); + Assertions.assertTrue(Double.isInfinite(statsC.maxValue)); } /** @@ -660,22 +652,19 @@ public void testInPredicateEstimationForColumns() { .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(100) - .setSelectivity(1.0); + .setMaxValue(100); ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() .setNdv(20) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(500) - .setSelectivity(1.0); + .setMaxValue(500); ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() .setNdv(40) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(40) - .setSelectivity(1.0); + .setMaxValue(40); slotToColumnStat.put(a, builderA.build()); slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -690,7 +679,7 @@ public void testInPredicateEstimationForColumns() { Assertions.assertEquals(5, statsA.ndv, 0.1); Assertions.assertEquals(0, statsA.minValue); Assertions.assertEquals(100, statsA.maxValue); - Assertions.assertEquals(4.5, statsB.ndv, 0.1); + Assertions.assertEquals(5, statsB.ndv, 0.1); Assertions.assertEquals(0, statsB.minValue); Assertions.assertEquals(500, statsB.maxValue); Assertions.assertEquals(2, statsC.ndv); @@ -732,7 +721,6 @@ public void testInPredicateEstimationForColumnsOutofRange() { .setNumNulls(0) .setMinValue(0) .setMaxValue(100) - .setSelectivity(1.0) .setCount(100); ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() .setCount(100) @@ -740,16 +728,14 @@ public void testInPredicateEstimationForColumnsOutofRange() { 
.setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(500) - .setSelectivity(1.0); + .setMaxValue(500); ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() .setCount(100) .setNdv(40) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(40) - .setSelectivity(1.0); + .setMaxValue(40); slotToColumnStat.put(a, builderA.build()); slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -767,7 +753,7 @@ public void testInPredicateEstimationForColumnsOutofRange() { Assertions.assertEquals(5, statsA.ndv, 0.1); Assertions.assertEquals(0, statsA.minValue); Assertions.assertEquals(100, statsA.maxValue); - Assertions.assertEquals(4.5, statsB.ndv, 0.1); + Assertions.assertEquals(5, statsB.ndv, 0.1); Assertions.assertEquals(0, statsB.minValue); Assertions.assertEquals(500, statsB.maxValue); Assertions.assertEquals(2, statsC.ndv); @@ -804,7 +790,6 @@ public void testFilterEstimationForColumnsNotChanged() { .setNumNulls(0) .setMinValue(0) .setMaxValue(100) - .setSelectivity(1.0) .setCount(100); ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() .setCount(100) @@ -812,16 +797,14 @@ public void testFilterEstimationForColumnsNotChanged() { .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(500) - .setSelectivity(1.0); + .setMaxValue(500); ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() .setCount(100) .setNdv(40) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(40) - .setSelectivity(1.0); + .setMaxValue(40); slotToColumnStat.put(a, builderA.build()); slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -836,7 +819,7 @@ public void testFilterEstimationForColumnsNotChanged() { Assertions.assertEquals(75, statsA.ndv); Assertions.assertEquals(0, statsA.minValue); Assertions.assertEquals(100, statsA.maxValue); - Assertions.assertEquals(19.9, statsB.ndv, 0.1); + Assertions.assertEquals(20, statsB.ndv, 0.1); 
Assertions.assertEquals(0, statsB.minValue); Assertions.assertEquals(500, statsB.maxValue); Assertions.assertEquals(30, statsC.ndv); @@ -853,7 +836,6 @@ public void testBetweenCastFilter() { .setNumNulls(0) .setMaxValue(100) .setMinValue(0) - .setSelectivity(1.0) .setCount(100); DoubleLiteral begin = new DoubleLiteral(40.0); DoubleLiteral end = new DoubleLiteral(50.0); @@ -881,7 +863,6 @@ public void testDateRangeSelectivity() { .setNumNulls(0) .setMaxValue(to.getDouble()) .setMinValue(from.getDouble()) - .setSelectivity(1.0) .setCount(100); DateLiteral mid = new DateLiteral("1999-01-01"); GreaterThan greaterThan = new GreaterThan(a, mid); diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java index 612daae8739b8d..e33c28ae933950 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java @@ -17,6 +17,7 @@ package org.apache.doris.nereids.util; +import org.apache.doris.catalog.Env; import org.apache.doris.common.Pair; import org.apache.doris.nereids.CascadesContext; import org.apache.doris.nereids.jobs.joinorder.JoinOrderJob; @@ -26,6 +27,7 @@ import org.apache.doris.nereids.trees.expressions.EqualTo; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.expressions.Slot; +import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.plans.JoinType; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.logical.LogicalJoin; @@ -35,6 +37,7 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalOlapScan; import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.Statistics; +import org.apache.doris.statistics.StatisticsCacheKey; import com.google.common.base.Preconditions; import 
com.google.common.collect.ImmutableList; @@ -58,14 +61,14 @@ public class HyperGraphBuilder { private final HashMap plans = new HashMap<>(); private final HashMap> schemas = new HashMap<>(); - private final ImmutableList fullJoinTypes = ImmutableList.of( + private ImmutableList fullJoinTypes = ImmutableList.of( JoinType.INNER_JOIN, JoinType.LEFT_OUTER_JOIN, JoinType.RIGHT_OUTER_JOIN, JoinType.FULL_OUTER_JOIN ); - private final ImmutableList leftFullJoinTypes = ImmutableList.of( + private ImmutableList leftFullJoinTypes = ImmutableList.of( JoinType.INNER_JOIN, JoinType.LEFT_OUTER_JOIN, JoinType.RIGHT_OUTER_JOIN, @@ -75,7 +78,7 @@ public class HyperGraphBuilder { JoinType.NULL_AWARE_LEFT_ANTI_JOIN ); - private final ImmutableList rightFullJoinTypes = ImmutableList.of( + private ImmutableList rightFullJoinTypes = ImmutableList.of( JoinType.INNER_JOIN, JoinType.LEFT_OUTER_JOIN, JoinType.RIGHT_OUTER_JOIN, @@ -84,12 +87,32 @@ public class HyperGraphBuilder { JoinType.RIGHT_ANTI_JOIN ); + public HyperGraphBuilder() {} + + public HyperGraphBuilder(Set validJoinType) { + fullJoinTypes = fullJoinTypes.stream() + .filter(validJoinType::contains) + .collect(ImmutableList.toImmutableList()); + leftFullJoinTypes = leftFullJoinTypes.stream() + .filter(validJoinType::contains) + .collect(ImmutableList.toImmutableList()); + rightFullJoinTypes = rightFullJoinTypes.stream() + .filter(validJoinType::contains) + .collect(ImmutableList.toImmutableList()); + } + public HyperGraph build() { assert plans.size() == 1 : "there are cross join"; Plan plan = plans.values().iterator().next(); return buildHyperGraph(plan); } + public Plan buildPlan() { + assert plans.size() == 1 : "there are cross join"; + Plan plan = plans.values().iterator().next(); + return plan; + } + public Plan buildJoinPlan() { assert plans.size() == 1 : "there are cross join"; Plan plan = plans.values().iterator().next(); @@ -166,9 +189,14 @@ public void initStats(CascadesContext context) { for (Group group : 
context.getMemo().getGroups()) { GroupExpression groupExpression = group.getLogicalExpression(); if (groupExpression.getPlan() instanceof LogicalOlapScan) { + LogicalOlapScan scan = (LogicalOlapScan) groupExpression.getPlan(); Statistics stats = injectRowcount((LogicalOlapScan) groupExpression.getPlan()); - groupExpression.setStatDerived(true); - group.setStatistics(stats); + for (Expression expr : stats.columnStatistics().keySet()) { + SlotReference slot = (SlotReference) expr; + Env.getCurrentEnv().getStatisticsCache().putCache( + new StatisticsCacheKey(scan.getTable().getId(), -1, slot.getName()), + stats.columnStatistics().get(expr)); + } } } } @@ -313,8 +341,8 @@ private Statistics injectRowcount(LogicalOlapScan scanPlan) { for (Slot slot : scanPlan.getOutput()) { slotIdToColumnStats.put(slot, new ColumnStatistic(count, count, null, 1, 0, 0, 0, - count, 1, null, null, true, null, - new Date().toString())); + count, null, null, true, null, + new Date().toString(), null)); } return new Statistics(count, slotIdToColumnStats); } @@ -364,7 +392,7 @@ private Expression makeCondition(int node1, int node2, BitSet bitSet) { return hashConjunts; } - public Set> evaluate(Plan plan) { + public Set> evaluate(Plan plan) { JoinEvaluator evaluator = new JoinEvaluator(rowCounts); Map> res = evaluator.evaluate(plan); int rowCount = 0; @@ -376,11 +404,12 @@ public Set> evaluate(Plan plan) { (slot1, slot2) -> String.CASE_INSENSITIVE_ORDER.compare(slot1.toString(), slot2.toString())) .collect(Collectors.toList()); - Set> tuples = new HashSet<>(); + Set> tuples = new HashSet<>(); + tuples.add(keySet.stream().map(s -> s.toString()).collect(Collectors.toList())); for (int i = 0; i < rowCount; i++) { - List tuple = new ArrayList<>(); + List tuple = new ArrayList<>(); for (Slot key : keySet) { - tuple.add(res.get(key).get(i)); + tuple.add(String.valueOf(res.get(key).get(i))); } tuples.add(tuple); } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java index 1955a0d9a3e361..9624c20149828d 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java @@ -26,7 +26,6 @@ import org.apache.doris.statistics.AnalysisInfo.AnalysisMode; import org.apache.doris.statistics.AnalysisInfo.AnalysisType; import org.apache.doris.statistics.AnalysisInfo.JobType; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.utframe.TestWithFeService; @@ -38,6 +37,7 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -62,13 +62,7 @@ protected void runBeforeAll() throws Exception { } @Test - public void testCreateAnalysisJob(@Mocked AnalysisTaskScheduler scheduler) throws Exception { - new Expectations() { - { - scheduler.schedule((BaseAnalysisTask) any); - times = 3; - } - }; + public void testCreateAnalysisJob() throws Exception { new MockUp() { @@ -101,7 +95,7 @@ public ConnectContext get() { } @Test - public void testJobExecution(@Mocked AnalysisTaskScheduler scheduler, @Mocked StmtExecutor stmtExecutor) + public void testJobExecution(@Mocked StmtExecutor stmtExecutor) throws Exception { new MockUp() { @@ -120,10 +114,16 @@ public void execUpdate(String sql) throws Exception { public void syncLoadColStats(long tableId, long idxId, String colName) { } }; - new Expectations() { - { - stmtExecutor.execute(); - times = 2; + new MockUp() { + + @Mock + public void execute() throws Exception { + + } + + @Mock + public List executeInternalQuery() { + return new ArrayList<>(); } }; HashMap> colToPartitions = Maps.newHashMap(); @@ -135,8 +135,15 @@ public void syncLoadColStats(long tableId, long idxId, String colName) { 
.setAnalysisMethod(AnalysisMethod.FULL) .setAnalysisType(AnalysisType.FUNDAMENTALS) .setColToPartitions(colToPartitions) + .setState(AnalysisState.RUNNING) .build(); new OlapAnalysisTask(analysisJobInfo).doExecute(); + new Expectations() { + { + stmtExecutor.execute(); + times = 1; + } + }; } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisTaskExecutorTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisTaskExecutorTest.java index 42f643a137d8b7..196ac8ad9a056f 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisTaskExecutorTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisTaskExecutorTest.java @@ -24,13 +24,12 @@ import org.apache.doris.statistics.AnalysisInfo.AnalysisMode; import org.apache.doris.statistics.AnalysisInfo.AnalysisType; import org.apache.doris.statistics.AnalysisInfo.JobType; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.utframe.TestWithFeService; import com.google.common.collect.Maps; +import mockit.Expectations; import mockit.Mock; import mockit.MockUp; -import mockit.Mocked; import org.junit.jupiter.api.Test; import java.util.Collections; @@ -41,8 +40,6 @@ public class AnalysisTaskExecutorTest extends TestWithFeService { - @Mocked - AnalysisTaskScheduler analysisTaskScheduler; @Override protected void runBeforeAll() throws Exception { @@ -71,13 +68,7 @@ public void testExpiredJobCancellation() throws Exception { .build(); OlapAnalysisTask analysisJob = new OlapAnalysisTask(analysisJobInfo); - new MockUp() { - public synchronized BaseAnalysisTask getPendingTasks() { - return analysisJob; - } - }; - - AnalysisTaskExecutor analysisTaskExecutor = new AnalysisTaskExecutor(analysisTaskScheduler); + AnalysisTaskExecutor analysisTaskExecutor = new AnalysisTaskExecutor(1); BlockingQueue b = Deencapsulation.getField(analysisTaskExecutor, "taskQueue"); AnalysisTaskWrapper analysisTaskWrapper = new 
AnalysisTaskWrapper(analysisTaskExecutor, analysisJob); Deencapsulation.setField(analysisTaskWrapper, "startTime", 5); @@ -97,7 +88,12 @@ public List executeInternalQuery() { new MockUp() { @Mock - public void execSQL(String sql) throws Exception { + public void execSQLs(List sqls) throws Exception { + } + + @Mock + protected void executeWithExceptionOnFail(StmtExecutor stmtExecutor) throws Exception { + // DO NOTHING } }; @@ -108,7 +104,7 @@ public void syncLoadColStats(long tableId, long idxId, String colName) { } }; - AnalysisTaskExecutor analysisTaskExecutor = new AnalysisTaskExecutor(analysisTaskScheduler); + AnalysisTaskExecutor analysisTaskExecutor = new AnalysisTaskExecutor(1); HashMap> colToPartitions = Maps.newHashMap(); colToPartitions.put("col1", Collections.singleton("t1")); AnalysisInfo analysisInfo = new AnalysisInfoBuilder().setJobId(0).setTaskId(0) @@ -117,19 +113,20 @@ public void syncLoadColStats(long tableId, long idxId, String colName) { .setAnalysisMode(AnalysisMode.FULL) .setAnalysisMethod(AnalysisMethod.FULL) .setAnalysisType(AnalysisType.FUNDAMENTALS) + .setState(AnalysisState.RUNNING) .setColToPartitions(colToPartitions) .build(); OlapAnalysisTask task = new OlapAnalysisTask(analysisInfo); - new MockUp() { - @Mock - public synchronized BaseAnalysisTask getPendingTasks() { - return task; - } - }; + new MockUp() { @Mock public void updateTaskStatus(AnalysisInfo info, AnalysisState jobState, String message, long time) {} }; - Deencapsulation.invoke(analysisTaskExecutor, "doFetchAndExecute"); + new Expectations() { + { + task.doExecute(); + } + }; + Deencapsulation.invoke(analysisTaskExecutor, "submitTask", task); } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java index d6570ecebc5df8..77086723e27153 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java +++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java @@ -25,8 +25,10 @@ import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.datasource.CatalogMgr; import org.apache.doris.datasource.HMSExternalCatalog; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; +import org.apache.doris.ha.FrontendNodeType; import org.apache.doris.statistics.util.StatisticsUtil; +import org.apache.doris.system.Frontend; +import org.apache.doris.thrift.TUpdateFollowerStatsCacheRequest; import org.apache.doris.utframe.TestWithFeService; import com.google.common.collect.Lists; @@ -90,60 +92,20 @@ public List execStatisticQuery(String sql) { } catch (InterruptedException e) { // ignore } - List colNames = new ArrayList<>(); - colNames.add("count"); - colNames.add("ndv"); - colNames.add("null_count"); - colNames.add("data_size_in_bytes"); - colNames.add("catalog_id"); - colNames.add("db_id"); - colNames.add("idx_id"); - colNames.add("tbl_id"); - colNames.add("col_id"); - colNames.add("min"); - colNames.add("max"); - colNames.add("part_id"); - colNames.add("update_time"); - List primitiveTypes = new ArrayList<>(); - primitiveTypes.add(PrimitiveType.BIGINT); - primitiveTypes.add(PrimitiveType.BIGINT); - primitiveTypes.add(PrimitiveType.BIGINT); - primitiveTypes.add(PrimitiveType.BIGINT); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - List values = new ArrayList<>(); - values.add("1"); - values.add("2"); - values.add("3"); - values.add("4"); - values.add("5"); - values.add("-1"); - values.add("6"); - values.add("7"); - values.add("8"); - values.add("9"); - values.add("10"); - 
values.add(null); - values.add(new Date().toString()); - ResultRow resultRow = new ResultRow(colNames, primitiveTypes, values); - return Arrays.asList(resultRow); + return Arrays.asList(StatsMockUtil.mockResultRow(true)); } }; StatisticsCache statisticsCache = new StatisticsCache(); ColumnStatistic columnStatistic = statisticsCache.getColumnStatistics(-1, -1, 0, "col"); + // load not finished yet, should return unknown Assertions.assertTrue(columnStatistic.isUnKnown); + // wait 1 sec to ensure `execStatisticQuery` is finished as much as possible. Thread.sleep(1000); + // load has finished, return corresponding stats. columnStatistic = statisticsCache.getColumnStatistics(-1, -1, 0, "col"); - Assertions.assertEquals(1, columnStatistic.count); - Assertions.assertEquals(2, columnStatistic.ndv); - Assertions.assertEquals(10, columnStatistic.maxValue); + Assertions.assertEquals(7, columnStatistic.count); + Assertions.assertEquals(8, columnStatistic.ndv); + Assertions.assertEquals(11, columnStatistic.maxValue); } @Test @@ -159,11 +121,10 @@ public Histogram fromResultRow(ResultRow resultRow) { Type dataType = col.getType(); histogramBuilder.setDataType(dataType); + HistData histData = new HistData(resultRow); + histogramBuilder.setSampleRate(histData.sampleRate); - double sampleRate = Double.parseDouble(resultRow.getColumnValue("sample_rate")); - histogramBuilder.setSampleRate(sampleRate); - - String json = resultRow.getColumnValue("buckets"); + String json = histData.buckets; JsonObject jsonObj = JsonParser.parseString(json).getAsJsonObject(); int bucketNum = jsonObj.get("num_buckets").getAsInt(); @@ -202,28 +163,14 @@ public List execStatisticQuery(String sql) { } catch (InterruptedException e) { // ignore } - List colNames = new ArrayList<>(); - colNames.add("catalog_id"); - colNames.add("db_id"); - colNames.add("idx_id"); - colNames.add("tbl_id"); - colNames.add("col_id"); - colNames.add("sample_rate"); - colNames.add("buckets"); - List primitiveTypes = new 
ArrayList<>(); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); List values = new ArrayList<>(); values.add("1"); values.add("2"); values.add("3"); - values.add("-1"); values.add("4"); + values.add("-1"); + values.add("col"); + values.add(null); values.add("0.2"); String buckets = "{\"num_buckets\":5,\"buckets\":" + "[{\"lower\":\"2022-09-21 17:30:29\",\"upper\":\"2022-09-21 22:30:29\"," @@ -237,7 +184,8 @@ public List execStatisticQuery(String sql) { + "{\"lower\":\"2022-09-25 17:30:29\",\"upper\":\"2022-09-25 22:30:29\"," + "\"count\":9,\"pre_sum\":37,\"ndv\":1}]}"; values.add(buckets); - ResultRow resultRow = new ResultRow(colNames, primitiveTypes, values); + values.add(new Date().toString()); + ResultRow resultRow = new ResultRow(values); return Collections.singletonList(resultRow); } }; @@ -251,10 +199,10 @@ public List execStatisticQuery(String sql) { @Test public void testLoadFromMeta(@Mocked Env env, - @Mocked CatalogMgr mgr, - @Mocked HMSExternalCatalog catalog, - @Mocked HMSExternalDatabase db, - @Mocked HMSExternalTable table) throws Exception { + @Mocked CatalogMgr mgr, + @Mocked HMSExternalCatalog catalog, + @Mocked HMSExternalDatabase db, + @Mocked HMSExternalTable table) throws Exception { new MockUp() { @Mock @@ -290,8 +238,8 @@ public Env getCurrentEnv() { table.getColumnStatistic("col"); result = new ColumnStatistic(1, 2, - null, 3, 4, 5, 6, 7, 8, - null, null, false, null, new Date().toString()); + null, 3, 4, 5, 6, 7, + null, null, false, null, new Date().toString(), null); } }; StatisticsCache statisticsCache = new StatisticsCache(); @@ -306,4 +254,96 @@ public Env getCurrentEnv() { Assertions.assertEquals(6, columnStatistic.minValue); Assertions.assertEquals(7, 
columnStatistic.maxValue); } + + @Test + public void testSync1() throws Exception { + new MockUp() { + @Mock + public List loadColStats(long tableId, long idxId, String colName) { + List rows = new ArrayList<>(); + rows.add(StatsMockUtil.mockResultRow(true)); + rows.add(StatsMockUtil.mockResultRow(false)); + return rows; + } + + @Mock + public boolean isMaster(Frontend frontend) { + return frontend.getRole().equals(FrontendNodeType.MASTER); + } + }; + new MockUp() { + @Mock + public List getFrontends(FrontendNodeType nodeType) { + Frontend frontend1 = new Frontend(FrontendNodeType.MASTER, + "fe1", "localhost:1111", "localhost", 2222); + Frontend frontend2 = new Frontend(FrontendNodeType.FOLLOWER, + "fe1", "localhost:1112", "localhost", 2223); + List frontends = new ArrayList<>(); + frontends.add(frontend1); + frontends.add(frontend2); + return frontends; + } + }; + + new MockUp() { + @Mock + private void sendStats(Frontend frontend, + TUpdateFollowerStatsCacheRequest updateFollowerStatsCacheRequest) { + // DO NOTHING + } + }; + StatisticsCache statisticsCache = new StatisticsCache(); + statisticsCache.syncLoadColStats(1L, 1L, "any"); + new Expectations() { + { + statisticsCache.sendStats((Frontend) any, (TUpdateFollowerStatsCacheRequest) any); + times = 1; + } + }; + } + + @Test + public void testSync2() throws Exception { + new MockUp() { + @Mock + + public ColumnStatistic fromResultRow(ResultRow row) { + return ColumnStatistic.UNKNOWN; + } + + @Mock + public ColumnStatistic fromResultRow(List row) { + return ColumnStatistic.UNKNOWN; + } + }; + new MockUp() { + @Mock + public List getFrontends(FrontendNodeType nodeType) { + Frontend frontend1 = new Frontend(FrontendNodeType.MASTER, + "fe1", "localhost:1111", "localhost", 2222); + Frontend frontend2 = new Frontend(FrontendNodeType.FOLLOWER, + "fe1", "localhost:1112", "localhost", 2223); + List frontends = new ArrayList<>(); + frontends.add(frontend1); + frontends.add(frontend2); + return frontends; + } + }; + + 
new MockUp() { + @Mock + private void sendStats(Frontend frontend, + TUpdateFollowerStatsCacheRequest updateFollowerStatsCacheRequest) { + // DO NOTHING + } + }; + StatisticsCache statisticsCache = new StatisticsCache(); + statisticsCache.syncLoadColStats(1L, 1L, "any"); + new Expectations() { + { + statisticsCache.sendStats((Frontend) any, (TUpdateFollowerStatsCacheRequest) any); + times = 0; + } + }; + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/HistogramTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/HistogramTaskTest.java index d3d5245a81f850..0660c994a12783 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/HistogramTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/HistogramTaskTest.java @@ -30,7 +30,6 @@ import mockit.Mock; import mockit.MockUp; -import mockit.Mocked; import org.junit.FixMethodOrder; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -43,9 +42,6 @@ @FixMethodOrder(value = MethodSorters.NAME_ASCENDING) public class HistogramTaskTest extends TestWithFeService { - @Mocked - AnalysisTaskScheduler analysisTaskScheduler; - @Override protected void runBeforeAll() throws Exception { createDatabase("histogram_task_test"); @@ -96,7 +92,7 @@ public void test1TaskCreation() throws Exception { @Test public void test2TaskExecution() throws Exception { - AnalysisTaskExecutor analysisTaskExecutor = new AnalysisTaskExecutor(analysisTaskScheduler); + AnalysisTaskExecutor analysisTaskExecutor = new AnalysisTaskExecutor(1); AnalysisInfo analysisInfo = new AnalysisInfoBuilder() .setJobId(0).setTaskId(0).setCatalogName("internal") .setDbName(SystemInfoService.DEFAULT_CLUSTER + ":" + "histogram_task_test").setTblName("t1") @@ -107,17 +103,11 @@ public void test2TaskExecution() throws Exception { .build(); HistogramTask task = new HistogramTask(analysisInfo); - new MockUp() { - @Mock - public synchronized BaseAnalysisTask getPendingTasks() { - return 
task; - } - }; new MockUp() { @Mock public void updateTaskStatus(AnalysisInfo info, AnalysisState jobState, String message, long time) {} }; - Deencapsulation.invoke(analysisTaskExecutor, "doFetchAndExecute"); + Deencapsulation.invoke(analysisTaskExecutor, "submitTask", task); } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java index 78872a547d4084..a1ff5b13587522 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java @@ -29,7 +29,8 @@ public class StatsDeriveResultTest { public void testUpdateRowCountByLimit() { StatsDeriveResult stats = new StatsDeriveResult(100); ColumnStatistic a = new ColumnStatistic(100, 10, null, 1, 5, 10, - 1, 100, 0.5, null, null, false, null, new Date().toString()); + 1, 100, null, null, false, null, + new Date().toString(), null); Id id = new Id(1); stats.addColumnStats(id, a); StatsDeriveResult res = stats.updateByLimit(0); @@ -42,7 +43,6 @@ public void testUpdateRowCountByLimit() { Assertions.assertEquals(1, resColStats.dataSize); Assertions.assertEquals(1, resColStats.minValue); Assertions.assertEquals(100, resColStats.maxValue); - Assertions.assertEquals(0, resColStats.selectivity); Assertions.assertEquals(false, resColStats.isUnKnown); res = stats.updateByLimit(1); @@ -53,7 +53,6 @@ public void testUpdateRowCountByLimit() { Assertions.assertEquals(1, resColStats.dataSize); Assertions.assertEquals(1, resColStats.minValue); Assertions.assertEquals(100, resColStats.maxValue); - Assertions.assertEquals(0.05, resColStats.selectivity); Assertions.assertEquals(false, resColStats.isUnKnown); } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java new file mode 100644 index 
00000000000000..21035051ff8606 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics; + +import java.util.ArrayList; +import java.util.List; + +public class StatsMockUtil { + + public static ResultRow mockResultRow(boolean col) { + List vals = new ArrayList() {{ + add("0"); + add("1"); + add("2"); + add("3"); + add("-1"); + add("5"); + if (col) { + add(null); + } else { + add("6"); + } + add("7"); + add("8"); + add("0"); + add("10"); + add("11"); + add("12"); + add(String.valueOf(System.currentTimeMillis())); + }}; + return new ResultRow(vals); + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/InternalQueryResultTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/InternalQueryResultTest.java deleted file mode 100644 index 8d2518ae406dc5..00000000000000 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/InternalQueryResultTest.java +++ /dev/null @@ -1,119 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics.util; - -import org.apache.doris.catalog.PrimitiveType; -import org.apache.doris.common.DdlException; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; - -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.util.Arrays; -import java.util.List; - - -public class InternalQueryResultTest { - private InternalQueryResult queryResult; - private InternalQueryResult.ResultRow resultRow; - - @Before - public void setUp() throws Exception { - List columns = Arrays.asList("c1", "c2", "c3", "c4", "c5"); - List types = Arrays.asList(PrimitiveType.STRING, - PrimitiveType.INT, PrimitiveType.FLOAT, - PrimitiveType.DOUBLE, PrimitiveType.BIGINT); - queryResult = new InternalQueryResult(); - List values = Arrays.asList("s1", "1000", "0.1", "0.0001", "1000000"); - resultRow = new ResultRow(columns, types, values); - } - - @Test - public void testGetColumnIndex() { - Assert.assertEquals(0, resultRow.getColumnIndex("c1")); - Assert.assertEquals(1, resultRow.getColumnIndex("c2")); - Assert.assertEquals(2, resultRow.getColumnIndex("c3")); - Assert.assertEquals(3, resultRow.getColumnIndex("c4")); - Assert.assertEquals(4, resultRow.getColumnIndex("c5")); - } - - @Test - public void testGetColumnName() 
throws Exception { - Assert.assertEquals("c1", resultRow.getColumnName(0)); - Assert.assertEquals("c2", resultRow.getColumnName(1)); - Assert.assertEquals("c3", resultRow.getColumnName(2)); - Assert.assertEquals("c4", resultRow.getColumnName(3)); - Assert.assertEquals("c5", resultRow.getColumnName(4)); - } - - @Test - public void testGetColumnTypeWithIndex() { - try { - Assert.assertEquals(PrimitiveType.STRING, resultRow.getColumnType(0)); - Assert.assertEquals(PrimitiveType.INT, resultRow.getColumnType(1)); - Assert.assertEquals(PrimitiveType.FLOAT, resultRow.getColumnType(2)); - Assert.assertEquals(PrimitiveType.DOUBLE, resultRow.getColumnType(3)); - Assert.assertEquals(PrimitiveType.BIGINT, resultRow.getColumnType(4)); - } catch (DdlException e) { - e.printStackTrace(); - Assert.fail(); - } - } - - @Test - public void testGetColumnTypeWithName() { - try { - Assert.assertEquals(PrimitiveType.STRING, resultRow.getColumnType("c1")); - Assert.assertEquals(PrimitiveType.INT, resultRow.getColumnType("c2")); - Assert.assertEquals(PrimitiveType.FLOAT, resultRow.getColumnType("c3")); - Assert.assertEquals(PrimitiveType.DOUBLE, resultRow.getColumnType("c4")); - Assert.assertEquals(PrimitiveType.BIGINT, resultRow.getColumnType("c5")); - } catch (DdlException e) { - e.printStackTrace(); - Assert.fail(); - } - } - - @Test - public void testGetColumnValueWithIndex() throws Exception { - Assert.assertEquals("s1", resultRow.getColumnValue(0).toString()); - Assert.assertEquals(1000, Integer.parseInt((String) resultRow.getColumnValue(1))); - Assert.assertEquals(0.1f, Float.parseFloat((String) resultRow.getColumnValue(2)), 0.0001); - Assert.assertEquals(0.0001, Double.parseDouble((String) resultRow.getColumnValue(3)), 0.0001); - Assert.assertEquals(1000000, Long.parseLong((String) resultRow.getColumnValue(4))); - } - - @Test - public void testGetColumnValueWithName() throws Exception { - Assert.assertEquals("s1", resultRow.getColumnValue(0).toString()); - Assert.assertEquals(1000, 
Integer.parseInt((String) resultRow.getColumnValue(1))); - Assert.assertEquals(0.1f, Float.parseFloat((String) resultRow.getColumnValue(2)), 0.0001); - Assert.assertEquals(0.0001, Double.parseDouble((String) resultRow.getColumnValue(3)), 0.0001); - Assert.assertEquals(1000000, Long.parseLong((String) resultRow.getColumnValue(4))); - } - - @Test - public void testGetTypeValue() throws Exception { - Assert.assertEquals("s1", resultRow.getString(0)); - Assert.assertEquals(1000, resultRow.getInt(1)); - Assert.assertEquals(0.1f, resultRow.getFloat(2), 0.0001); - Assert.assertEquals(0.0001, resultRow.getDouble(3), 0.0001); - Assert.assertEquals(1000000, resultRow.getLong(4)); - } -} diff --git a/gensrc/thrift/FrontendService.thrift b/gensrc/thrift/FrontendService.thrift index 5a3cf88db8f6ea..37633aeb83658b 100644 --- a/gensrc/thrift/FrontendService.thrift +++ b/gensrc/thrift/FrontendService.thrift @@ -1091,7 +1091,7 @@ struct TGetBinlogLagResult { struct TUpdateFollowerStatsCacheRequest { 1: optional string key; - 2: optional string colStats; + 2: list statsRows; } service FrontendService { diff --git a/regression-test/suites/statistics/analyze_stats.groovy b/regression-test/suites/statistics/analyze_stats.groovy index 3fa926ab06d442..b168230b2d56e2 100644 --- a/regression-test/suites/statistics/analyze_stats.groovy +++ b/regression-test/suites/statistics/analyze_stats.groovy @@ -1,3 +1,5 @@ +import java.util.stream.Collectors + // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. 
See the NOTICE file // distributed with this work for additional information @@ -162,25 +164,25 @@ suite("test_analyze") { """ a_result_3 = sql """ - ANALYZE DATABASE ${db} WITH SAMPLE PERCENT 5 WITH AUTO + ANALYZE DATABASE ${db} WITH SAMPLE PERCENT 5 """ show_result = sql """ SHOW ANALYZE """ - def contains_expected_table = {r -> - for(int i = 0; i < r.size; i++) { - if (r[i][3] == "${tbl}" ) { + def contains_expected_table = { r -> + for (int i = 0; i < r.size; i++) { + if (r[i][3] == "${tbl}") { return true } } return false } - def stats_job_removed = {r, id -> - for(int i = 0; i < r.size; i++) { - if (r[i][0] == id ) { + def stats_job_removed = { r, id -> + for (int i = 0; i < r.size; i++) { + if (r[i][0] == id) { return false } } @@ -190,14 +192,14 @@ suite("test_analyze") { assert contains_expected_table(show_result) sql """ - DROP ANALYZE JOB ${a_result_3[0][4]} + DROP ANALYZE JOB ${a_result_3[0][0]} """ show_result = sql """ SHOW ANALYZE """ - assert stats_job_removed(show_result, a_result_3[0][4]) + assert stats_job_removed(show_result, a_result_3[0][0]) sql """ ANALYZE DATABASE ${db} WITH SAMPLE ROWS 5 WITH PERIOD 100000 @@ -238,8 +240,8 @@ suite("test_analyze") { SHOW COLUMN CACHED STATS analyze_partitioned_tbl_test(col1) """ - def expected_result = { r-> - for(int i = 0; i < r.size; i++) { + def expected_result = { r -> + for (int i = 0; i < r.size; i++) { if ((int) Double.parseDouble(r[i][1]) == 6) { return true } else { @@ -888,7 +890,7 @@ PARTITION `p599` VALUES IN (599) SHOW COLUMN CACHED STATS test_600_partition_table_analyze(id); """ - def expected_col_stats = { r, expected_value, idx -> + def expected_col_stats = { r, expected_value, idx -> return (int) Double.parseDouble(r[0][idx]) == expected_value } @@ -1048,7 +1050,7 @@ PARTITION `p599` VALUES IN (599) sql """ DROP TABLE IF EXISTS two_thousand_partition_table_test """ - + // check analyze table with thousand partition sql """ CREATE TABLE two_thousand_partition_table_test (col1 int(11451) not 
null) DUPLICATE KEY(col1) @@ -1067,5 +1069,49 @@ PARTITION `p599` VALUES IN (599) ANALYZE TABLE two_thousand_partition_table_test WITH SYNC; """ -} + // meta check + sql """ + CREATE TABLE `test_meta_management` ( + `col1` varchar(11451) NOT NULL, + `col2` int(11) NOT NULL, + `col3` int(11) NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(`col1`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`col1`) BUCKETS 3 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """insert into test_meta_management values(1, 2, 3);""" + sql """insert into test_meta_management values(4, 5, 6);""" + sql """insert into test_meta_management values(7, 1, 9);""" + sql """insert into test_meta_management values(3, 8, 2);""" + sql """insert into test_meta_management values(5, 2, 1);""" + sql """insert into test_meta_management values(41, 2, 3)""" + sql """ANALYZE TABLE test_meta_management WITH SYNC""" + sql """DROP STATS test_meta_management(col1)""" + + def afterDropped = sql """SHOW TABLE STATS test_meta_management""" + def convert_col_list_str_to_java_collection = { cols -> + if (cols.startsWith("[") && cols.endsWith("]")) { + cols = cols.substring(1, cols.length() - 1); + } + return Arrays.stream(cols.split(",")).map(String::trim).collect(Collectors.toList()) + } + + def check_column = { r, expected -> + expected_result = convert_col_list_str_to_java_collection(expected) + actual_result = convert_col_list_str_to_java_collection(r[0][4]) + System.out.println(expected_result) + System.out.println(actual_result) + return expected_result.containsAll(actual_result) && actual_result.containsAll(expected_result) + } + assert check_column(afterDropped, "[col2, col3]") + sql """ANALYZE TABLE test_meta_management WITH SYNC""" + afterDropped = sql """SHOW TABLE STATS test_meta_management""" + assert check_column(afterDropped, "[col1, col2, col3]") + +}