diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go index 31abbbc6dc458..4335e90d82b62 100644 --- a/statistics/cmsketch.go +++ b/statistics/cmsketch.go @@ -799,10 +799,7 @@ func MergePartTopN2GlobalTopN(loc *time.Location, version int, topNs []*TopN, n if checkEmptyTopNs(topNs) { return nil, nil, hists, nil } - partNum := len(topNs) - removeVals := make([][]TopNMeta, partNum) - // Different TopN structures may hold the same value, we have to merge them. counter := make(map[hack.MutableString]float64) // datumMap is used to store the mapping from the string type to datum type. @@ -864,22 +861,11 @@ func MergePartTopN2GlobalTopN(loc *time.Location, version int, topNs []*TopN, n if count != 0 { counter[encodedVal] += count // Remove the value corresponding to encodedVal from the histogram. - removeVals[j] = append(removeVals[j], TopNMeta{Encoded: datum.GetBytes(), Count: uint64(count)}) + hists[j].BinarySearchRemoveVal(TopNMeta{Encoded: datum.GetBytes(), Count: uint64(count)}) } } } } - // Remove the value from the Hists. - for i := 0; i < partNum; i++ { - if len(removeVals[i]) > 0 { - tmp := removeVals[i] - slices.SortFunc(tmp, func(i, j TopNMeta) bool { - cmpResult := bytes.Compare(i.Encoded, j.Encoded) - return cmpResult < 0 - }) - hists[i].RemoveVals(tmp) - } - } numTop := len(counter) if numTop == 0 { return nil, nil, hists, nil diff --git a/statistics/histogram.go b/statistics/histogram.go index d4f7e9ed72447..642836c83a941 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -281,6 +281,35 @@ func (hg *Histogram) BucketToString(bktID, idxCols int) string { return fmt.Sprintf("num: %d lower_bound: %s upper_bound: %s repeats: %d ndv: %d", hg.bucketCount(bktID), lowerVal, upperVal, hg.Buckets[bktID].Repeat, hg.Buckets[bktID].NDV) } +// BinarySearchRemoveVal removes the value from the TopN using binary search. +func (hg *Histogram) BinarySearchRemoveVal(valCntPairs TopNMeta) { + lowIdx, highIdx := 0, hg.Len()-1 + for lowIdx <= highIdx { + midIdx := (lowIdx + highIdx) / 2 + cmpResult := bytes.Compare(hg.Bounds.Column(0).GetRaw(midIdx*2), valCntPairs.Encoded) + if cmpResult > 0 { + lowIdx = midIdx + 1 + continue + } + cmpResult = bytes.Compare(hg.Bounds.Column(0).GetRaw(midIdx*2+1), valCntPairs.Encoded) + if cmpResult < 0 { + highIdx = midIdx - 1 + continue + } + if hg.Buckets[midIdx].NDV > 0 { + hg.Buckets[midIdx].NDV-- + } + if cmpResult == 0 { + hg.Buckets[midIdx].Repeat = 0 + } + hg.Buckets[midIdx].Count -= int64(valCntPairs.Count) + if hg.Buckets[midIdx].Count < 0 { + hg.Buckets[midIdx].Count = 0 + } + break + } +} + // RemoveVals remove the given values from the histogram. // This function contains an **ASSUMPTION**: valCntPairs is sorted in ascending order. func (hg *Histogram) RemoveVals(valCntPairs []TopNMeta) {