From b6ff4ad4b14e4802bde9713c2165cf1a8f373347 Mon Sep 17 00:00:00 2001
From: Han Fei
Date: Thu, 6 Apr 2017 10:15:18 +0800
Subject: [PATCH] statistics: add bucket structure. (#2993)

---
 statistics/builder.go         |  95 ++++++++++++----------
 statistics/column.go          | 144 ++++++++++++++++------------------
 statistics/statistics_test.go |  21 ++++-
 statistics/statscache_test.go |   8 +-
 4 files changed, 143 insertions(+), 125 deletions(-)

diff --git a/statistics/builder.go b/statistics/builder.go
index b8ef4ab518803..f58296c8a003a 100644
--- a/statistics/builder.go
+++ b/statistics/builder.go
@@ -132,7 +132,7 @@ func (b *Builder) NewTable() (*Table, error) {
 		if len(b.TblInfo.Indices[offset].Columns) == 1 {
 			for j, col := range b.TblInfo.Columns {
 				if col.Name.L == b.TblInfo.Indices[offset].Columns[0].Name.L && t.Columns[j] == nil {
-					t.Columns[j], err = copyFromIndexColumns(t.Indices[offset], col.ID, b.NumBuckets)
+					t.Columns[j], err = copyFromIndexColumns(t.Indices[offset], col.ID)
 					if err != nil {
 						return nil, errors.Trace(err)
 					}
@@ -165,9 +165,7 @@ func (t *Table) build4SortedColumn(sc *variable.StatementContext, offset int, re
 	col := &Column{
 		ID:      id,
 		NDV:     0,
-		Numbers: make([]int64, 1, bucketCount),
-		Values:  make([]types.Datum, 1, bucketCount),
-		Repeats: make([]int64, 1, bucketCount),
+		Buckets: make([]bucket, 1, bucketCount),
 	}
 	var valuesPerBucket, lastNumber, bucketIdx int64 = 1, 0, 0
 	count := int64(0)
@@ -189,7 +187,7 @@ func (t *Table) build4SortedColumn(sc *variable.StatementContext, offset int, re
 			}
 			data = types.NewBytesDatum(bytes)
 		}
-		cmp, err := col.Values[bucketIdx].CompareDatum(sc, data)
+		cmp, err := col.Buckets[bucketIdx].Value.CompareDatum(sc, data)
 		if err != nil {
 			return errors.Trace(err)
 		}
@@ -198,13 +196,13 @@ func (t *Table) build4SortedColumn(sc *variable.StatementContext, offset int, re
 			// The new item has the same value as current bucket value, to ensure that
 			// a same value only stored in a single bucket, we do not increase bucketIdx even if it exceeds
 			// valuesPerBucket.
-			col.Numbers[bucketIdx]++
-			col.Repeats[bucketIdx]++
-		} else if col.Numbers[bucketIdx]+1-lastNumber <= valuesPerBucket {
+			col.Buckets[bucketIdx].Count++
+			col.Buckets[bucketIdx].Repeats++
+		} else if col.Buckets[bucketIdx].Count+1-lastNumber <= valuesPerBucket {
 			// The bucket still have room to store a new item, update the bucket.
-			col.Numbers[bucketIdx]++
-			col.Values[bucketIdx] = data
-			col.Repeats[bucketIdx] = 0
+			col.Buckets[bucketIdx].Count++
+			col.Buckets[bucketIdx].Value = data
+			col.Buckets[bucketIdx].Repeats = 1
 			col.NDV++
 		} else {
 			// All buckets are full, we should merge buckets.
@@ -215,20 +213,22 @@ func (t *Table) build4SortedColumn(sc *variable.StatementContext, offset int, re
 				if bucketIdx == 0 {
 					lastNumber = 0
 				} else {
-					lastNumber = col.Numbers[bucketIdx-1]
+					lastNumber = col.Buckets[bucketIdx-1].Count
 				}
 			}
 			// We may merge buckets, so we should check it again.
-			if col.Numbers[bucketIdx]+1-lastNumber <= valuesPerBucket {
-				col.Numbers[bucketIdx]++
-				col.Values[bucketIdx] = data
-				col.Repeats[bucketIdx] = 0
+			if col.Buckets[bucketIdx].Count+1-lastNumber <= valuesPerBucket {
+				col.Buckets[bucketIdx].Count++
+				col.Buckets[bucketIdx].Value = data
+				col.Buckets[bucketIdx].Repeats = 1
 			} else {
-				lastNumber = col.Numbers[bucketIdx]
-				bucketIdx++
-				col.Numbers = append(col.Numbers, lastNumber+1)
-				col.Values = append(col.Values, data)
-				col.Repeats = append(col.Repeats, 0)
+				lastNumber = col.Buckets[bucketIdx].Count
+				bucketIdx++
+				col.Buckets = append(col.Buckets, bucket{
+					Count:   lastNumber + 1,
+					Value:   data,
+					Repeats: 1,
+				})
 			}
 			col.NDV++
 		}
@@ -252,54 +252,63 @@ func (t *Table) buildColumn(sc *variable.StatementContext, offset int, ndv int64
 	col := &Column{
 		ID:      ci.ID,
 		NDV:     ndv,
-		Numbers: make([]int64, 1, bucketCount),
-		Values:  make([]types.Datum, 1, bucketCount),
-		Repeats: make([]int64, 1, bucketCount),
+		Buckets: make([]bucket, 1, bucketCount),
 	}
 	valuesPerBucket := t.Count/bucketCount + 1
 	// As we use samples to build the histogram, the bucket number and repeat should multiply a factor.
 	sampleFactor := t.Count / int64(len(samples))
+	ndvFactor := t.Count / ndv
+	if ndvFactor > sampleFactor {
+		ndvFactor = sampleFactor
+	}
 	bucketIdx := 0
-	var lastNumber int64
+	var lastCount int64
 	for i := int64(0); i < int64(len(samples)); i++ {
-		cmp, err := col.Values[bucketIdx].CompareDatum(sc, samples[i])
+		cmp, err := col.Buckets[bucketIdx].Value.CompareDatum(sc, samples[i])
 		if err != nil {
 			return errors.Trace(err)
 		}
+		totalCount := (i + 1) * sampleFactor
 		if cmp == 0 {
 			// The new item has the same value as current bucket value, to ensure that
 			// a same value only stored in a single bucket, we do not increase bucketIdx even if it exceeds
 			// valuesPerBucket.
-			col.Numbers[bucketIdx] = i * sampleFactor
-			col.Repeats[bucketIdx] += sampleFactor
-		} else if i*sampleFactor-lastNumber <= valuesPerBucket {
+			col.Buckets[bucketIdx].Count = totalCount
+			if col.Buckets[bucketIdx].Repeats == ndvFactor {
+				col.Buckets[bucketIdx].Repeats = 2 * sampleFactor
+			} else {
+				col.Buckets[bucketIdx].Repeats += sampleFactor
+			}
+		} else if totalCount-lastCount <= valuesPerBucket {
+			// TODO: Making sampleFactor a float may be better.
 			// The bucket still have room to store a new item, update the bucket.
-			col.Numbers[bucketIdx] = i * sampleFactor
-			col.Values[bucketIdx] = samples[i]
-			col.Repeats[bucketIdx] = 0
+			col.Buckets[bucketIdx].Count = totalCount
+			col.Buckets[bucketIdx].Value = samples[i]
+			col.Buckets[bucketIdx].Repeats = ndvFactor
 		} else {
+			lastCount = col.Buckets[bucketIdx].Count
 			// The bucket is full, store the item in the next bucket.
-			lastNumber = col.Numbers[bucketIdx]
 			bucketIdx++
-			col.Numbers = append(col.Numbers, i*sampleFactor)
-			col.Values = append(col.Values, samples[i])
-			col.Repeats = append(col.Repeats, 0)
+			col.Buckets = append(col.Buckets, bucket{
+				Count:   totalCount,
+				Value:   samples[i],
+				Repeats: ndvFactor,
+			})
 		}
 	}
 	t.Columns[offset] = col
 	return nil
 }
 
-func copyFromIndexColumns(ind *Column, id, numBuckets int64) (*Column, error) {
+func copyFromIndexColumns(ind *Column, id int64) (*Column, error) {
 	col := &Column{
 		ID:      id,
 		NDV:     ind.NDV,
-		Numbers: ind.Numbers,
-		Values:  make([]types.Datum, 0, numBuckets),
-		Repeats: ind.Repeats,
+		Buckets: make([]bucket, 0, len(ind.Buckets)),
 	}
-	for _, val := range ind.Values {
+	for _, b := range ind.Buckets {
+		val := b.Value
 		if val.GetBytes() == nil {
 			break
 		}
@@ -307,7 +316,11 @@ func copyFromIndexColumns(ind *Column, id, numBuckets int64) (*Column, error) {
 		if err != nil {
 			return nil, errors.Trace(err)
 		}
-		col.Values = append(col.Values, data[0])
+		col.Buckets = append(col.Buckets, bucket{
+			Count:   b.Count,
+			Value:   data[0],
+			Repeats: b.Repeats,
+		})
 	}
 	return col, nil
 }
diff --git a/statistics/column.go b/statistics/column.go
index ea297dc8e485c..1023595f4a778 100644
--- a/statistics/column.go
+++ b/statistics/column.go
@@ -31,19 +31,22 @@
 type Column struct {
 	ID  int64 // Column ID.
 	NDV int64 // Number of distinct values.
-	// Histogram elements.
-	//
-	// A bucket number is the number of items stored in all previous buckets and the current bucket.
-	// bucket numbers are always in increasing order.
-	//
-	// A bucket value is the greatest item value stored in the bucket.
-	//
-	// Repeat is the number of repeats of the bucket value, it can be used to find popular values.
-	//
-	// TODO: We could have make a bucket struct contains number, value and repeats.
-	Numbers []int64
-	Values  []types.Datum
-	Repeats []int64
+	Buckets []bucket
+}
+
+// bucket is an element of a histogram.
+//
+// A bucket count is the number of items stored in all previous buckets and the current bucket.
+// Bucket counts are always in increasing order.
+//
+// A bucket value is the greatest item value stored in the bucket.
+//
+// Repeats is the number of repeats of the bucket value; it can be used to find popular values.
+//
+type bucket struct {
+	Count   int64
+	Value   types.Datum
+	Repeats int64
 }
 
 func (c *Column) saveToStorage(ctx context.Context, tableID int64, isIndex int) error {
@@ -52,18 +55,18 @@ func (c *Column) saveToStorage(ctx context.Context, tableID int64, isIndex int)
 	if err != nil {
 		return errors.Trace(err)
 	}
-	for i := 0; i < len(c.Numbers); i++ {
+	for i, bucket := range c.Buckets {
 		var count int64
 		if i == 0 {
-			count = c.Numbers[i]
+			count = bucket.Count
 		} else {
-			count = c.Numbers[i] - c.Numbers[i-1]
+			count = bucket.Count - c.Buckets[i-1].Count
 		}
-		val, err := c.Values[i].ConvertTo(ctx.GetSessionVars().StmtCtx, types.NewFieldType(mysql.TypeBlob))
+		val, err := bucket.Value.ConvertTo(ctx.GetSessionVars().StmtCtx, types.NewFieldType(mysql.TypeBlob))
 		if err != nil {
 			return errors.Trace(err)
 		}
-		insertSQL = fmt.Sprintf("insert into mysql.stats_buckets values(%d, %d, %d, %d, %d, %d, X'%X')", tableID, isIndex, c.ID, i, count, c.Repeats[i], val.GetBytes())
+		insertSQL = fmt.Sprintf("insert into mysql.stats_buckets values(%d, %d, %d, %d, %d, %d, X'%X')", tableID, isIndex, c.ID, i, count, bucket.Repeats, val.GetBytes())
 		_, err = ctx.(sqlexec.SQLExecutor).Execute(insertSQL)
 		if err != nil {
 			return errors.Trace(err)
@@ -82,9 +85,7 @@ func colStatsFromStorage(ctx context.Context, tableID int64, colID int64, tp *ty
 	colStats := &Column{
 		ID:      colID,
 		NDV:     distinct,
-		Numbers: make([]int64, bucketSize),
-		Repeats: make([]int64, bucketSize),
-		Values:  make([]types.Datum, bucketSize),
+		Buckets: make([]bucket, bucketSize),
 	}
 	for i := 0; i < bucketSize; i++ {
 		bucketID := rows[i].Data[0].GetInt64()
@@ -99,99 +100,93 @@ func colStatsFromStorage(ctx context.Context, tableID int64, colID int64, tp *ty
 				return nil, errors.Trace(err)
 			}
 		}
-		colStats.Numbers[bucketID] = count
-		colStats.Repeats[bucketID] = repeats
-		colStats.Values[bucketID] = value
+		colStats.Buckets[bucketID] = bucket{
+			Count:   count,
+			Value:   value,
+			Repeats: repeats,
+		}
 	}
 	for i := 1; i < bucketSize; i++ {
-		colStats.Numbers[i] += colStats.Numbers[i-1]
+		colStats.Buckets[i].Count += colStats.Buckets[i-1].Count
 	}
 	return colStats, nil
 }
 
 func (c *Column) String() string {
-	strs := make([]string, 0, len(c.Numbers)+1)
+	strs := make([]string, 0, len(c.Buckets)+1)
 	strs = append(strs, fmt.Sprintf("column:%d ndv:%d", c.ID, c.NDV))
-	for i := range c.Numbers {
-		strVal, _ := c.Values[i].ToString()
-		strs = append(strs, fmt.Sprintf("num: %d\tvalue: %s\trepeats: %d", c.Numbers[i], strVal, c.Repeats[i]))
+	for _, bucket := range c.Buckets {
+		strVal, _ := bucket.Value.ToString()
+		strs = append(strs, fmt.Sprintf("num: %d\tvalue: %s\trepeats: %d", bucket.Count, strVal, bucket.Repeats))
 	}
 	return strings.Join(strs, "\n")
 }
 
 // EqualRowCount estimates the row count where the column equals to value.
 func (c *Column) EqualRowCount(sc *variable.StatementContext, value types.Datum) (int64, error) {
-	if len(c.Numbers) == 0 {
+	if len(c.Buckets) == 0 {
 		return pseudoRowCount / pseudoEqualRate, nil
 	}
-	index, match, err := c.search(sc, value)
+	index, match, err := c.lowerBound(sc, value)
 	if err != nil {
 		return 0, errors.Trace(err)
 	}
-	if index == len(c.Numbers) {
-		return c.Numbers[index-1] + 1, nil
+	if index == len(c.Buckets) {
+		return 0, nil
 	}
 	if match {
-		return c.Repeats[index] + 1, nil
+		return c.Buckets[index].Repeats, nil
 	}
-	totalCount := c.Numbers[len(c.Numbers)-1] + 1
-	return totalCount / c.NDV, nil
+	return c.totalRowCount() / c.NDV, nil
 }
 
 // GreaterRowCount estimates the row count where the column greater than value.
 func (c *Column) GreaterRowCount(sc *variable.StatementContext, value types.Datum) (int64, error) {
-	if len(c.Numbers) == 0 {
+	if len(c.Buckets) == 0 {
 		return pseudoRowCount / pseudoLessRate, nil
 	}
-	index, match, err := c.search(sc, value)
+	lessCount, err := c.LessRowCount(sc, value)
 	if err != nil {
 		return 0, errors.Trace(err)
 	}
-	if index == 0 {
-		return c.totalRowCount(), nil
-	}
-	if index >= len(c.Numbers) {
-		return 0, nil
-	}
-	number := c.Numbers[index]
-	nextNumber := int64(0)
-	if index < len(c.Numbers)-1 {
-		nextNumber = c.Numbers[index+1]
+	eqCount, err := c.EqualRowCount(sc, value)
+	if err != nil {
+		return 0, errors.Trace(err)
 	}
-	greaterThanBucketValueCount := number - c.Repeats[index]
-	if match {
-		return greaterThanBucketValueCount, nil
+	gtCount := c.totalRowCount() - lessCount - eqCount
+	if gtCount < 0 {
+		gtCount = 0
 	}
-	return (nextNumber + greaterThanBucketValueCount) / 2, nil
+	return gtCount, nil
 }
 
 // LessRowCount estimates the row count where the column less than value.
 func (c *Column) LessRowCount(sc *variable.StatementContext, value types.Datum) (int64, error) {
-	if len(c.Numbers) == 0 {
+	if len(c.Buckets) == 0 {
 		return pseudoRowCount / pseudoLessRate, nil
 	}
-	index, match, err := c.search(sc, value)
+	index, match, err := c.lowerBound(sc, value)
 	if err != nil {
 		return 0, errors.Trace(err)
 	}
-	if index == len(c.Numbers) {
+	if index == len(c.Buckets) {
 		return c.totalRowCount(), nil
 	}
-	number := c.Numbers[index]
-	prevNumber := int64(0)
+	curCount := c.Buckets[index].Count
+	prevCount := int64(0)
 	if index > 0 {
-		prevNumber = c.Numbers[index-1]
+		prevCount = c.Buckets[index-1].Count
 	}
-	lessThanBucketValueCount := number - c.Repeats[index]
+	lessThanBucketValueCount := curCount - c.Buckets[index].Repeats
 	if match {
 		return lessThanBucketValueCount, nil
 	}
-	return (prevNumber + lessThanBucketValueCount) / 2, nil
+	return (prevCount + lessThanBucketValueCount) / 2, nil
 }
 
 // BetweenRowCount estimates the row count where column greater or equal to a and less than b.
 func (c *Column) BetweenRowCount(sc *variable.StatementContext, a, b types.Datum) (int64, error) {
-	if len(c.Numbers) == 0 {
+	if len(c.Buckets) == 0 {
 		return pseudoRowCount / pseudoBetweenRate, nil
 	}
 	lessCountA, err := c.LessRowCount(sc, a)
@@ -209,20 +204,21 @@ func (c *Column) BetweenRowCount(sc *variable.StatementContext, a, b types.Datum
 }
 
 func (c *Column) totalRowCount() int64 {
-	return c.Numbers[len(c.Numbers)-1] + 1
+	return c.Buckets[len(c.Buckets)-1].Count
 }
 
 func (c *Column) bucketRowCount() int64 {
-	return c.totalRowCount() / int64(len(c.Numbers))
+	return c.totalRowCount() / int64(len(c.Buckets))
 }
 
 func (c *Column) inBucketBetweenCount() int64 {
+	// TODO: Make this estimation more accurate using the uniform spread assumption.
 	return c.bucketRowCount()/3 + 1
 }
 
-func (c *Column) search(sc *variable.StatementContext, target types.Datum) (index int, match bool, err error) {
-	index = sort.Search(len(c.Values), func(i int) bool {
-		cmp, err1 := c.Values[i].CompareDatum(sc, target)
+func (c *Column) lowerBound(sc *variable.StatementContext, target types.Datum) (index int, match bool, err error) {
+	index = sort.Search(len(c.Buckets), func(i int) bool {
+		cmp, err1 := c.Buckets[i].Value.CompareDatum(sc, target)
 		if err1 != nil {
 			err = errors.Trace(err1)
 			return false
@@ -239,19 +235,17 @@
 func (c *Column) mergeBuckets(bucketIdx int64) {
 	curBuck := 0
 	for i := int64(0); i+1 <= bucketIdx; i += 2 {
-		c.Numbers[curBuck] = c.Numbers[i+1]
-		c.Values[curBuck] = c.Values[i+1]
-		c.Repeats[curBuck] = c.Repeats[i+1]
+		c.Buckets[curBuck] = bucket{
+			Count:   c.Buckets[i+1].Count,
+			Value:   c.Buckets[i+1].Value,
+			Repeats: c.Buckets[i+1].Repeats,
+		}
 		curBuck++
 	}
 	if bucketIdx%2 == 0 {
-		c.Numbers[curBuck] = c.Numbers[bucketIdx]
-		c.Values[curBuck] = c.Values[bucketIdx]
-		c.Repeats[curBuck] = c.Repeats[bucketIdx]
+		c.Buckets[curBuck] = c.Buckets[bucketIdx]
 		curBuck++
 	}
-	c.Numbers = c.Numbers[:curBuck]
-	c.Values = c.Values[:curBuck]
-	c.Repeats = c.Repeats[:curBuck]
+	c.Buckets = c.Buckets[:curBuck]
 	return
 }
diff --git a/statistics/statistics_test.go b/statistics/statistics_test.go
index 5fc7c6f5ec9cf..9265acbc32d14 100644
--- a/statistics/statistics_test.go
+++ b/statistics/statistics_test.go
@@ -169,15 +169,28 @@ func (s *testStatisticsSuite) TestTable(c *C) {
 	c.Check(err, IsNil)
 
 	col := t.Columns[0]
+	c.Check(len(col.Buckets), Equals, 232)
 	count, err := col.EqualRowCount(sc, types.NewIntDatum(1000))
 	c.Check(err, IsNil)
 	c.Check(count, Equals, int64(1))
 	count, err = col.LessRowCount(sc, types.NewIntDatum(2000))
 	c.Check(err, IsNil)
-	c.Check(count, Equals, int64(19955))
+	c.Check(count, Equals, int64(19964))
+	count, err = col.GreaterRowCount(sc, types.NewIntDatum(2000))
+	c.Check(err, IsNil)
+	c.Check(count, Equals, int64(80035))
+	count, err = col.LessRowCount(sc, types.NewIntDatum(200000000))
+	c.Check(err, IsNil)
+	c.Check(count, Equals, int64(100000))
+	count, err = col.GreaterRowCount(sc, types.NewIntDatum(200000000))
+	c.Check(err, IsNil)
+	c.Check(count, Equals, int64(0))
+	count, err = col.EqualRowCount(sc, types.NewIntDatum(200000000))
+	c.Check(err, IsNil)
+	c.Check(count, Equals, int64(0))
 	count, err = col.BetweenRowCount(sc, types.NewIntDatum(3000), types.NewIntDatum(3500))
 	c.Check(err, IsNil)
-	c.Check(count, Equals, int64(5075))
+	c.Check(count, Equals, int64(5079))
 
 	col = t.Columns[1]
 	count, err = col.EqualRowCount(sc, types.NewIntDatum(10000))
@@ -185,7 +198,7 @@
 	c.Check(count, Equals, int64(1))
 	count, err = col.LessRowCount(sc, types.NewIntDatum(20000))
 	c.Check(err, IsNil)
-	c.Check(count, Equals, int64(19984))
+	c.Check(count, Equals, int64(19983))
 	count, err = col.BetweenRowCount(sc, types.NewIntDatum(30000), types.NewIntDatum(35000))
 	c.Check(err, IsNil)
 	c.Check(count, Equals, int64(4618))
@@ -196,7 +209,7 @@
 	c.Check(count, Equals, int64(1))
 	count, err = col.LessRowCount(sc, types.NewIntDatum(20000))
 	c.Check(err, IsNil)
-	c.Check(count, Equals, int64(20224))
+	c.Check(count, Equals, int64(20223))
 	count, err = col.BetweenRowCount(sc, types.NewIntDatum(30000), types.NewIntDatum(35000))
 	c.Check(err, IsNil)
 	c.Check(count, Equals, int64(5120))
diff --git a/statistics/statscache_test.go b/statistics/statscache_test.go
index 11f2fb05fe95d..9dbb9b2be257e 100644
--- a/statistics/statscache_test.go
+++ b/statistics/statscache_test.go
@@ -95,11 +95,9 @@ func compareTwoColumnsStatsSlice(cols0 []*statistics.Column, cols1 []*statistics
 		for _, col1 := range cols1 {
 			if col0.ID == col1.ID {
 				c.Assert(col0.NDV, Equals, col1.NDV)
-				c.Assert(len(col0.Numbers), Equals, len(col1.Numbers))
-				for j := 0; j < len(col0.Numbers); j++ {
-					c.Assert(col0.Numbers[j], Equals, col1.Numbers[j])
-					c.Assert(col0.Repeats[j], Equals, col1.Repeats[j])
-					c.Assert(col0.Values[j], DeepEquals, col1.Values[j])
+				c.Assert(len(col0.Buckets), Equals, len(col1.Buckets))
+				for j := 0; j < len(col0.Buckets); j++ {
+					c.Assert(col0.Buckets[j], DeepEquals, col1.Buckets[j])
 				}
 				find = true
 				break
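
For reference, below is a minimal, self-contained sketch (not part of the patch) of how the bucket structure introduced here is used to answer cardinality estimates. It deliberately simplifies the real code: types.Datum is replaced by int64, error handling and the StatementContext are dropped, and the lowercase names (column, equalRowCount, ...) only mirror the patch's Column methods rather than reproduce them exactly.

package main

import (
	"fmt"
	"sort"
)

// bucket mirrors the struct added by the patch: Count is the cumulative number
// of rows up to and including this bucket, Value is the largest value stored in
// the bucket, and Repeats is how many rows equal that value.
type bucket struct {
	Count   int64
	Value   int64
	Repeats int64
}

type column struct {
	NDV     int64
	Buckets []bucket
}

// totalRowCount is the cumulative count of the last bucket (assumes non-empty histogram).
func (c *column) totalRowCount() int64 { return c.Buckets[len(c.Buckets)-1].Count }

// lowerBound finds the first bucket whose upper value is >= target, and whether it matches exactly.
func (c *column) lowerBound(target int64) (int, bool) {
	i := sort.Search(len(c.Buckets), func(i int) bool { return c.Buckets[i].Value >= target })
	match := i < len(c.Buckets) && c.Buckets[i].Value == target
	return i, match
}

// equalRowCount: Repeats for an exact bucket-boundary match, total/NDV otherwise,
// and 0 for values beyond the histogram (as in the patched EqualRowCount).
func (c *column) equalRowCount(v int64) int64 {
	i, match := c.lowerBound(v)
	if i == len(c.Buckets) {
		return 0
	}
	if match {
		return c.Buckets[i].Repeats
	}
	return c.totalRowCount() / c.NDV
}

// lessRowCount: rows before the bucket, plus roughly half the bucket when the
// value falls strictly inside it (as in the patched LessRowCount).
func (c *column) lessRowCount(v int64) int64 {
	i, match := c.lowerBound(v)
	if i == len(c.Buckets) {
		return c.totalRowCount()
	}
	prev := int64(0)
	if i > 0 {
		prev = c.Buckets[i-1].Count
	}
	inBucketLess := c.Buckets[i].Count - c.Buckets[i].Repeats
	if match {
		return inBucketLess
	}
	return (prev + inBucketLess) / 2
}

// greaterRowCount is derived, as in the patch, from total - less - equal, clamped at zero.
func (c *column) greaterRowCount(v int64) int64 {
	gt := c.totalRowCount() - c.lessRowCount(v) - c.equalRowCount(v)
	if gt < 0 {
		gt = 0
	}
	return gt
}

func main() {
	// Three buckets over 30 rows with cumulative counts 10/20/30.
	c := &column{
		NDV: 30,
		Buckets: []bucket{
			{Count: 10, Value: 10, Repeats: 1},
			{Count: 20, Value: 20, Repeats: 1},
			{Count: 30, Value: 30, Repeats: 1},
		},
	}
	fmt.Println(c.equalRowCount(20), c.lessRowCount(20), c.greaterRowCount(20))
}

Running this prints "1 19 10": one row equal to 20, 19 rows below it, and 10 above, out of 30. Keeping Count cumulative is the central design choice of the new struct: a single binary search over bucket upper values yields the "rows before this point" figure directly, while Repeats separates rows equal to a bucket's upper value from the rest of that bucket.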