From 5ff6bc19538357c2e72d4ad667f31258754c19a3 Mon Sep 17 00:00:00 2001
From: Yuanjia Zhang <zhangyuanjia@pingcap.com>
Date: Mon, 13 Jul 2020 15:20:49 +0800
Subject: [PATCH 1/5] update a heuristic rule

---
 statistics/histogram.go        |  8 ++++----
 statistics/selectivity_test.go |  6 +++---
 statistics/table.go            | 23 +++++++++++++++++------
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/statistics/histogram.go b/statistics/histogram.go
index d0ca7c09c58f9..e5f56d4acf633 100644
--- a/statistics/histogram.go
+++ b/statistics/histogram.go
@@ -764,7 +764,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
 		return 0.0, nil
 	}
 	if c.NDV > 0 && c.outOfRange(val) {
-		return float64(modifyCount) / float64(c.NDV), nil
+		return outOfRangeEQSelectivity(c.NDV, modifyCount) * c.TotalRowCount(), nil
 	}
 	if c.CMSketch != nil {
 		count, err := c.CMSketch.queryValue(sc, val)
@@ -829,7 +829,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
 		// The interval case.
 		cnt := c.BetweenRowCount(lowVal, highVal)
 		if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) {
-			cnt += float64(modifyCount) / outOfRangeBetweenRate
+			cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount) * c.TotalRowCount()
 		}
 		// `betweenRowCount` returns count for [l, h) range, we adjust cnt for boudaries here.
 		// Note that, `cnt` does not include null values, we need specially handle cases
@@ -891,7 +891,7 @@ func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCo
 	}
 	val := types.NewBytesDatum(b)
 	if idx.NDV > 0 && idx.outOfRange(val) {
-		return float64(modifyCount) / (float64(idx.NDV)), nil
+		return outOfRangeEQSelectivity(idx.NDV, modifyCount) * idx.TotalRowCount(), nil
 	}
 	if idx.CMSketch != nil {
 		return float64(idx.CMSketch.QueryBytes(b)), nil
@@ -943,7 +943,7 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*range
 		totalCount += idx.BetweenRowCount(l, r)
 		lowIsNull := bytes.Equal(lb, nullKeyBytes)
 		if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) {
-			totalCount += float64(modifyCount) / outOfRangeBetweenRate
+			totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount) * idx.TotalRowCount()
 		}
 		if isSingleCol && lowIsNull {
 			totalCount += float64(idx.NullCount)
diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go
index 8a4e402f9c7f2..a2e32dc391cf0 100644
--- a/statistics/selectivity_test.go
+++ b/statistics/selectivity_test.go
@@ -396,15 +396,15 @@ func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) {
 	colID := table.Meta().Columns[0].ID
 	count, err := statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(30, 30))
 	c.Assert(err, IsNil)
-	c.Assert(count, Equals, 2.0)
+	c.Assert(count, Equals, 0.2)
 
 	count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, 30))
 	c.Assert(err, IsNil)
-	c.Assert(count, Equals, 4.2)
+	c.Assert(count, Equals, 2.4000000000000004)
 
 	count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, math.MaxInt64))
 	c.Assert(err, IsNil)
-	c.Assert(count, Equals, 4.2)
+	c.Assert(count, Equals, 2.4000000000000004)
 
 	idxID := table.Meta().Indices[0].ID
 	count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(30, 30))
diff --git a/statistics/table.go b/statistics/table.go
index d48a8206282b1..63335ce5b091a 100644
--- a/statistics/table.go
+++ b/statistics/table.go
@@ -392,6 +392,21 @@ func isSingleColIdxNullRange(idx *Index, ran *ranger.Range) bool {
 	return false
 }
 
+func outOfRangeEQSelectivity(ndv, modifyRows int64) float64 {
+	// It must be 0 since the histogram contains the whole data if modifyRows is 0.
+	if modifyRows == 0 {
+		return 0
+	}
+	// We simply set its selectivity to 1/NDV, and the magic number outOfRangeBetweenRate
+	// is used to avoid wrong selectivity caused by small NDV.
+	if ndv < outOfRangeBetweenRate {
+		ndv = outOfRangeBetweenRate
+	}
+	// TODO: After extracting TopN from histograms, we can minus the TopN fraction here.
+	// Please see https://github.com/pingcap/tidb/issues/18461 for more details.
+	return 1 / float64(ndv)
+}
+
 // getEqualCondSelectivity gets the selectivity of the equal conditions.
 func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedColsLen int) float64 {
 	coverAll := len(idx.Info.Columns) == usedColsLen
@@ -404,8 +419,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols
 		// When the value is out of range, we could not found this value in the CM Sketch,
 		// so we use heuristic methods to estimate the selectivity.
 		if idx.NDV > 0 && coverAll {
-			// for equality queries
-			return float64(coll.ModifyCount) / float64(idx.NDV) / idx.TotalRowCount()
+			return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount)
 		}
 		// The equal condition only uses prefix columns of the index.
 		colIDs := coll.Idx2ColumnIDs[idx.ID]
@@ -416,10 +430,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols
 			}
 			ndv = mathutil.MaxInt64(ndv, coll.Columns[colID].NDV)
 		}
-		if ndv > 0 {
-			return float64(coll.ModifyCount) / float64(ndv) / idx.TotalRowCount()
-		}
-		return float64(coll.ModifyCount) / outOfRangeBetweenRate / idx.TotalRowCount()
+		return outOfRangeEQSelectivity(ndv, coll.ModifyCount)
 	}
 	return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount())
 }

From 394f5f1f44f8a6c10cb6f8c4eb2677a0b25748e5 Mon Sep 17 00:00:00 2001
From: Yuanjia Zhang <zhangyuanjia@pingcap.com>
Date: Tue, 14 Jul 2020 16:00:48 +0800
Subject: [PATCH 2/5] fix CI

---
 planner/core/cbo_test.go                     |  4 ++--
 planner/core/testdata/analyze_suite_out.json | 13 +++++++------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/planner/core/cbo_test.go b/planner/core/cbo_test.go
index ac359deacc7ad..781952adf3e05 100644
--- a/planner/core/cbo_test.go
+++ b/planner/core/cbo_test.go
@@ -414,8 +414,8 @@ func (s *testAnalyzeSuite) TestOutdatedAnalyze(c *C) {
 	c.Assert(h.Update(dom.InfoSchema()), IsNil)
 	statistics.RatioOfPseudoEstimate.Store(10.0)
 	testKit.MustQuery("explain select * from t where a <= 5 and b <= 5").Check(testkit.Rows(
-		"TableReader_7 35.91 root  data:Selection_6",
-		"└─Selection_6 35.91 cop[tikv]  le(test.t.a, 5), le(test.t.b, 5)",
+		"TableReader_7 29.77 root  data:Selection_6",
+		"└─Selection_6 29.77 cop[tikv]  le(test.t.a, 5), le(test.t.b, 5)",
 		"  └─TableFullScan_5 80.00 cop[tikv] table:t keep order:false",
 	))
 	statistics.RatioOfPseudoEstimate.Store(0.7)
diff --git a/planner/core/testdata/analyze_suite_out.json b/planner/core/testdata/analyze_suite_out.json
index 38ee91b1a89ac..0cfa239b2f3e6 100644
--- a/planner/core/testdata/analyze_suite_out.json
+++ b/planner/core/testdata/analyze_suite_out.json
@@ -347,17 +347,18 @@
       {
         "SQL": "explain select * from t where a = 7639902",
         "Plan": [
-          "IndexReader_6 2.03 root  index:IndexRangeScan_5",
-          "└─IndexRangeScan_5 2.03 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false"
+          "IndexReader_6 6.68 root  index:IndexRangeScan_5",
+          "└─IndexRangeScan_5 6.68 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false"
         ]
       },
       {
         "SQL": "explain select c, b from t where a = 7639902 order by b asc limit 6",
         "Plan": [
-          "Projection_7 2.03 root  test.t.c, test.t.b",
-          "└─TopN_10 2.03 root  test.t.b, offset:0, count:6",
-          "  └─IndexReader_18 2.03 root  index:IndexRangeScan_17",
-          "    └─IndexRangeScan_17 2.03 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false"
+          "Projection_7 6.00 root  test.t.c, test.t.b",
+          "└─TopN_8 6.00 root  test.t.b, offset:0, count:6",
+          "  └─IndexReader_16 6.00 root  index:TopN_15",
+          "    └─TopN_15 6.00 cop[tikv]  test.t.b, offset:0, count:6",
+          "      └─IndexRangeScan_14 6.68 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false"
         ]
       }
     ]

From b3b8bc7479db539c60f8cf9eb4d1aa42260b20bf Mon Sep 17 00:00:00 2001
From: Yuanjia Zhang <zhangyuanjia@pingcap.com>
Date: Tue, 14 Jul 2020 16:07:51 +0800
Subject: [PATCH 3/5] fix CI

---
 cmd/explaintest/r/explain_union_scan.result | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/cmd/explaintest/r/explain_union_scan.result b/cmd/explaintest/r/explain_union_scan.result
index 45f9ef1e4ab1a..087f0d62efa21 100644
--- a/cmd/explaintest/r/explain_union_scan.result
+++ b/cmd/explaintest/r/explain_union_scan.result
@@ -16,16 +16,16 @@ Limit_20	10.00	root		offset:0, count:10
 └─HashJoin_22	10.00	root		left outer join, equal:[eq(test.city.province_id, test.city.province_id)]
   ├─Limit_25(Build)	10.00	root		offset:0, count:10
   │ └─IndexJoin_38	10.00	root		inner join, inner:UnionScan_37, outer key:test.city.id, inner key:test.city.id
-  │   ├─UnionScan_47(Build)	10.33	root		
-  │   │ └─TableReader_49	10.33	root		data:TableFullScan_48
-  │   │   └─TableFullScan_48	10.33	cop[tikv]	table:t2	keep order:false
-  │   └─UnionScan_37(Probe)	0.97	root		gt(test.city.province_id, 1), lt(test.city.province_id, 100)
-  │     └─IndexLookUp_36	0.97	root		
+  │   ├─UnionScan_47(Build)	10.00	root		
+  │   │ └─TableReader_49	10.00	root		data:TableFullScan_48
+  │   │   └─TableFullScan_48	10.00	cop[tikv]	table:t2	keep order:false
+  │   └─UnionScan_37(Probe)	1.00	root		gt(test.city.province_id, 1), lt(test.city.province_id, 100)
+  │     └─IndexLookUp_36	1.00	root		
   │       ├─IndexRangeScan_33(Build)	1.00	cop[tikv]	table:t1, index:PRIMARY(id)	range: decided by [eq(test.city.id, test.city.id)], keep order:false
-  │       └─Selection_35(Probe)	0.97	cop[tikv]		gt(test.city.province_id, 1), lt(test.city.province_id, 100)
+  │       └─Selection_35(Probe)	1.00	cop[tikv]		gt(test.city.province_id, 1), lt(test.city.province_id, 100)
   │         └─TableRowIDScan_34	1.00	cop[tikv]	table:t1	keep order:false
-  └─UnionScan_57(Probe)	519304.44	root		gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id))
-    └─TableReader_60	519304.44	root		data:Selection_59
-      └─Selection_59	519304.44	cop[tikv]		gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id))
+  └─UnionScan_57(Probe)	536284.00	root		gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id))
+    └─TableReader_60	536284.00	root		data:Selection_59
+      └─Selection_59	536284.00	cop[tikv]		gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id))
         └─TableFullScan_58	536284.00	cop[tikv]	table:t3	keep order:false
 commit;

From 413462a2d3caac50d1047e22e7cc014fdc8ce304 Mon Sep 17 00:00:00 2001
From: Yuanjia Zhang <zhangyuanjia@pingcap.com>
Date: Tue, 14 Jul 2020 16:16:39 +0800
Subject: [PATCH 4/5] fix CI

---
 statistics/handle/update_test.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/statistics/handle/update_test.go b/statistics/handle/update_test.go
index 742d7f7f80fbe..06196cb9f226f 100644
--- a/statistics/handle/update_test.go
+++ b/statistics/handle/update_test.go
@@ -1524,8 +1524,8 @@ func (s *testStatsSuite) TestAbnormalIndexFeedback(c *C) {
 			sql: "select * from t where a = 2 and b > 10",
 			hist: "column:2 ndv:20 totColSize:20\n" +
 				"num: 5 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0\n" +
-				"num: 6 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
-				"num: 7 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0",
+				"num: 4 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
+				"num: 5 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0",
 			rangeID: tblInfo.Columns[1].ID,
 			idxID:   tblInfo.Indices[0].ID,
 			eqCount: 3,

From 9eecd96d6b42bb25d9e3fb72ca92bc7e92c1b91b Mon Sep 17 00:00:00 2001
From: Yuanjia Zhang <zhangyuanjia@pingcap.com>
Date: Mon, 27 Jul 2020 15:12:50 +0800
Subject: [PATCH 5/5] address comments

---
 statistics/histogram.go        |  8 ++++----
 statistics/selectivity_test.go | 28 ++++++++++++++++++++++++++++
 statistics/table.go            | 25 ++++++++++++++-----------
 3 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/statistics/histogram.go b/statistics/histogram.go
index e5f56d4acf633..07d040db82f76 100644
--- a/statistics/histogram.go
+++ b/statistics/histogram.go
@@ -764,7 +764,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
 		return 0.0, nil
 	}
 	if c.NDV > 0 && c.outOfRange(val) {
-		return outOfRangeEQSelectivity(c.NDV, modifyCount) * c.TotalRowCount(), nil
+		return outOfRangeEQSelectivity(c.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
 	}
 	if c.CMSketch != nil {
 		count, err := c.CMSketch.queryValue(sc, val)
@@ -829,7 +829,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
 		// The interval case.
 		cnt := c.BetweenRowCount(lowVal, highVal)
 		if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) {
-			cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount) * c.TotalRowCount()
+			cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount()
 		}
 		// `betweenRowCount` returns count for [l, h) range, we adjust cnt for boudaries here.
 		// Note that, `cnt` does not include null values, we need specially handle cases
@@ -891,7 +891,7 @@ func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCo
 	}
 	val := types.NewBytesDatum(b)
 	if idx.NDV > 0 && idx.outOfRange(val) {
-		return outOfRangeEQSelectivity(idx.NDV, modifyCount) * idx.TotalRowCount(), nil
+		return outOfRangeEQSelectivity(idx.NDV, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount(), nil
 	}
 	if idx.CMSketch != nil {
 		return float64(idx.CMSketch.QueryBytes(b)), nil
@@ -943,7 +943,7 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*range
 		totalCount += idx.BetweenRowCount(l, r)
 		lowIsNull := bytes.Equal(lb, nullKeyBytes)
 		if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) {
-			totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount) * idx.TotalRowCount()
+			totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount()
 		}
 		if isSingleCol && lowIsNull {
 			totalCount += float64(idx.NullCount)
diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go
index a2e32dc391cf0..46f9bbd6096c1 100644
--- a/statistics/selectivity_test.go
+++ b/statistics/selectivity_test.go
@@ -370,6 +370,34 @@ func getRange(start, end int64) []*ranger.Range {
 	return []*ranger.Range{ran}
 }
 
+func (s *testStatsSuite) TestOutOfRangeEQEstimation(c *C) {
+	defer cleanEnv(c, s.store, s.do)
+	testKit := testkit.NewTestKit(c, s.store)
+	testKit.MustExec("use test")
+	testKit.MustExec("drop table if exists t")
+	testKit.MustExec("create table t(a int)")
+	for i := 0; i < 1000; i++ {
+		testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/4)) // values 0 ~ 249, each inserted 4 times
+	}
+	testKit.MustExec("analyze table t")
+
+	h := s.do.StatsHandle()
+	table, err := s.do.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
+	c.Assert(err, IsNil)
+	statsTbl := h.GetTableStats(table.Meta())
+	sc := &stmtctx.StatementContext{}
+	col := statsTbl.Columns[table.Meta().Columns[0].ID]
+	count, err := col.GetColumnRowCount(sc, getRange(250, 250), 0, false) // 250 was never inserted; with no modifications the estimate must be 0
+	c.Assert(err, IsNil)
+	c.Assert(count, Equals, float64(0))
+
+	for i := 0; i < 8; i++ {
+		count, err := col.GetColumnRowCount(sc, getRange(250, 250), int64(i+1), false)
+		c.Assert(err, IsNil)
+		c.Assert(count, Equals, math.Min(float64(i+1), 4)) // estRows must not exceed modifyCnt; 4 = 1000 rows / 250 distinct values
+	}
+}
+
 func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) {
 	defer cleanEnv(c, s.store, s.do)
 	testKit := testkit.NewTestKit(c, s.store)
diff --git a/statistics/table.go b/statistics/table.go
index 63335ce5b091a..518da999b6695 100644
--- a/statistics/table.go
+++ b/statistics/table.go
@@ -392,19 +392,22 @@ func isSingleColIdxNullRange(idx *Index, ran *ranger.Range) bool {
 	return false
 }
 
-func outOfRangeEQSelectivity(ndv, modifyRows int64) float64 {
-	// It must be 0 since the histogram contains the whole data if modifyRows is 0.
+// outOfRangeEQSelectivity estimates selectivities for out-of-range values.
+// It assumes that all modifications are insertions and that the newly inserted rows are uniformly
+// distributed and follow the same distribution as the analyzed rows, which means each distinct
+// value is expected to have the same number of rows (Tot/NDV). The estimate is capped at modifyRows.
+func outOfRangeEQSelectivity(ndv, modifyRows, totalRows int64) float64 {
 	if modifyRows == 0 {
-		return 0
+		return 0 // it must be 0 since the histogram contains the whole data
 	}
-	// We simply set its selectivity to 1/NDV, and the magic number outOfRangeBetweenRate
-	// is used to avoid wrong selectivity caused by small NDV.
 	if ndv < outOfRangeBetweenRate {
-		ndv = outOfRangeBetweenRate
+		ndv = outOfRangeBetweenRate // avoid inaccurate selectivity caused by small NDV
 	}
-	// TODO: After extracting TopN from histograms, we can minus the TopN fraction here.
-	// Please see https://github.com/pingcap/tidb/issues/18461 for more details.
-	return 1 / float64(ndv)
+	selectivity := 1 / float64(ndv) // TODO: After extracting TopN from histograms, we can subtract the TopN fraction here.
+	if selectivity*float64(totalRows) > float64(modifyRows) { // the estimated row count must not exceed modifyRows
+		selectivity = float64(modifyRows) / float64(totalRows)
+	}
+	return selectivity
 }
 
 // getEqualCondSelectivity gets the selectivity of the equal conditions.
@@ -419,7 +422,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols
 		// When the value is out of range, we could not found this value in the CM Sketch,
 		// so we use heuristic methods to estimate the selectivity.
 		if idx.NDV > 0 && coverAll {
-			return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount)
+			return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount, int64(idx.TotalRowCount()))
 		}
 		// The equal condition only uses prefix columns of the index.
 		colIDs := coll.Idx2ColumnIDs[idx.ID]
@@ -430,7 +433,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols
 			}
 			ndv = mathutil.MaxInt64(ndv, coll.Columns[colID].NDV)
 		}
-		return outOfRangeEQSelectivity(ndv, coll.ModifyCount)
+		return outOfRangeEQSelectivity(ndv, coll.ModifyCount, int64(idx.TotalRowCount()))
 	}
 	return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount())
 }