From dfe6238b2629e5bae5764602278115220f36eed6 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 11 Aug 2023 23:03:27 +0800 Subject: [PATCH] statistics: reduce memory usage when to MergePartTopN2GlobalTopN (#45718) (#45970) close pingcap/tidb#45727 --- statistics/cmsketch.go | 16 +--------------- statistics/histogram.go | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go index 9c5c8bfdfa547..9406d9eb7a5b2 100644 --- a/statistics/cmsketch.go +++ b/statistics/cmsketch.go @@ -743,10 +743,7 @@ func MergePartTopN2GlobalTopN(loc *time.Location, version int, topNs []*TopN, n if checkEmptyTopNs(topNs) { return nil, nil, hists, nil } - partNum := len(topNs) - removeVals := make([][]TopNMeta, partNum) - // Different TopN structures may hold the same value, we have to merge them. counter := make(map[hack.MutableString]float64) // datumMap is used to store the mapping from the string type to datum type. @@ -808,22 +805,11 @@ func MergePartTopN2GlobalTopN(loc *time.Location, version int, topNs []*TopN, n if count != 0 { counter[encodedVal] += count // Remove the value corresponding to encodedVal from the histogram. - removeVals[j] = append(removeVals[j], TopNMeta{Encoded: datum.GetBytes(), Count: uint64(count)}) + hists[j].BinarySearchRemoveVal(TopNMeta{Encoded: datum.GetBytes(), Count: uint64(count)}) } } } } - // Remove the value from the Hists. 
-	for i := 0; i < partNum; i++ {
-		if len(removeVals[i]) > 0 {
-			tmp := removeVals[i]
-			slices.SortFunc(tmp, func(i, j TopNMeta) bool {
-				cmpResult := bytes.Compare(i.Encoded, j.Encoded)
-				return cmpResult < 0
-			})
-			hists[i].RemoveVals(tmp)
-		}
-	}
 	numTop := len(counter)
 	if numTop == 0 {
 		return nil, nil, hists, nil
diff --git a/statistics/histogram.go b/statistics/histogram.go
index 95c7d3350768a..a9aedf8a1d247 100644
--- a/statistics/histogram.go
+++ b/statistics/histogram.go
@@ -280,6 +280,35 @@ func (hg *Histogram) BucketToString(bktID, idxCols int) string {
 	return fmt.Sprintf("num: %d lower_bound: %s upper_bound: %s repeats: %d ndv: %d", hg.bucketCount(bktID), lowerVal, upperVal, hg.Buckets[bktID].Repeat, hg.Buckets[bktID].NDV)
 }
 
+// BinarySearchRemoveVal binary-searches the histogram buckets for the one whose [lower, upper] bounds contain valCntPairs.Encoded and removes that value from it (NDV, Repeat and Count are adjusted); it is a no-op when no bucket contains the value. The search relies on the bucket bounds being stored in ascending order.
+func (hg *Histogram) BinarySearchRemoveVal(valCntPairs TopNMeta) {
+	lowIdx, highIdx := 0, hg.Len()-1
+	for lowIdx <= highIdx {
+		midIdx := (lowIdx + highIdx) / 2
+		cmpResult := bytes.Compare(hg.Bounds.Column(0).GetRaw(midIdx*2), valCntPairs.Encoded)
+		if cmpResult > 0 {
+			highIdx = midIdx - 1 // lower bound of bucket midIdx is above the value: keep searching the earlier buckets
+			continue
+		}
+		cmpResult = bytes.Compare(hg.Bounds.Column(0).GetRaw(midIdx*2+1), valCntPairs.Encoded)
+		if cmpResult < 0 {
+			lowIdx = midIdx + 1 // upper bound of bucket midIdx is below the value: keep searching the later buckets
+			continue
+		}
+		if hg.Buckets[midIdx].NDV > 0 {
+			hg.Buckets[midIdx].NDV--
+		}
+		if cmpResult == 0 {
+			hg.Buckets[midIdx].Repeat = 0 // the value is exactly the bucket's upper bound, which is what Repeat counts
+		}
+		hg.Buckets[midIdx].Count -= int64(valCntPairs.Count)
+		if hg.Buckets[midIdx].Count < 0 {
+			hg.Buckets[midIdx].Count = 0 // clamp: never leave a negative bucket count behind
+		}
+		break
+	}
+}
+
 // RemoveVals remove the given values from the histogram.
 // This function contains an **ASSUMPTION**: valCntPairs is sorted in ascending order.
 func (hg *Histogram) RemoveVals(valCntPairs []TopNMeta) {