From a0d24096ca893244ae7c3eab33e2c148ae8679b8 Mon Sep 17 00:00:00 2001
From: xzhangxian1008
Date: Fri, 3 Nov 2023 10:59:09 +0800
Subject: [PATCH] executor: improve parallel hash aggregation (#47428)

close pingcap/tidb#47427
---
 .../aggregate/agg_hash_base_worker.go         |   2 +-
 pkg/executor/aggregate/agg_hash_executor.go   |  39 ++++--
 .../aggregate/agg_hash_final_worker.go        | 101 +++++++-------
 .../aggregate/agg_hash_partial_worker.go      | 106 +++++++++------
 pkg/executor/aggregate/agg_util.go            |  15 +--
 pkg/executor/test/aggregate/BUILD.bazel       |   2 +-
 pkg/executor/test/aggregate/aggregate_test.go | 123 ++++++++++++++++++
 7 files changed, 272 insertions(+), 116 deletions(-)

diff --git a/pkg/executor/aggregate/agg_hash_base_worker.go b/pkg/executor/aggregate/agg_hash_base_worker.go
index 67fa32f356a26..b2ed73cd06f0a 100644
--- a/pkg/executor/aggregate/agg_hash_base_worker.go
+++ b/pkg/executor/aggregate/agg_hash_base_worker.go
@@ -84,7 +84,7 @@ func (w *baseHashAggWorker) getPartialResult(_ *stmtctx.StatementContext, groupK
 
 func (w *baseHashAggWorker) getPartialResultSliceLenConsiderByteAlign() int {
 	length := len(w.aggFuncs)
-	if len(w.aggFuncs) == 1 {
+	if length == 1 {
 		return 1
 	}
 	return length + length&1
diff --git a/pkg/executor/aggregate/agg_hash_executor.go b/pkg/executor/aggregate/agg_hash_executor.go
index 9c4283dc6920b..a3bc512ece2fb 100644
--- a/pkg/executor/aggregate/agg_hash_executor.go
+++ b/pkg/executor/aggregate/agg_hash_executor.go
@@ -104,7 +104,7 @@ type HashAggExec struct {
 
 	finishCh         chan struct{}
 	finalOutputCh    chan *AfFinalResult
-	partialOutputChs []chan *HashAggIntermData
+	partialOutputChs []chan *AggPartialResultMapper
 	inputCh          chan *HashAggInput
 	partialInputChs  []chan *chunk.Chunk
 	partialWorkers   []HashAggPartialWorker
@@ -264,9 +264,9 @@ func (e *HashAggExec) initForParallelExec(_ sessionctx.Context) {
 	for i := range e.partialInputChs {
 		e.partialInputChs[i] = make(chan *chunk.Chunk, 1)
 	}
-	e.partialOutputChs = make([]chan *HashAggIntermData, finalConcurrency)
+	e.partialOutputChs = make([]chan *AggPartialResultMapper, finalConcurrency)
 	for i := range e.partialOutputChs {
-		e.partialOutputChs[i] = make(chan *HashAggIntermData, partialConcurrency)
+		e.partialOutputChs[i] = make(chan *AggPartialResultMapper, partialConcurrency)
 	}
 
 	e.partialWorkers = make([]HashAggPartialWorker, partialConcurrency)
@@ -275,17 +275,30 @@ func (e *HashAggExec) initForParallelExec(_ sessionctx.Context) {
 
 	// Init partial workers.
 	for i := 0; i < partialConcurrency; i++ {
+		partialResultsMap := make([]AggPartialResultMapper, finalConcurrency)
+		for i := 0; i < finalConcurrency; i++ {
+			partialResultsMap[i] = make(AggPartialResultMapper)
+		}
+
 		w := HashAggPartialWorker{
-			baseHashAggWorker: newBaseHashAggWorker(e.Ctx(), e.finishCh, e.PartialAggFuncs, e.MaxChunkSize(), e.memTracker),
-			inputCh:           e.partialInputChs[i],
-			outputChs:         e.partialOutputChs,
-			giveBackCh:        e.inputCh,
-			globalOutputCh:    e.finalOutputCh,
-			partialResultsMap: make(AggPartialResultMapper),
-			groupByItems:      e.GroupByItems,
-			chk:               exec.TryNewCacheChunk(e.Children(0)),
-			groupKey:          make([][]byte, 0, 8),
+			baseHashAggWorker:    newBaseHashAggWorker(e.Ctx(), e.finishCh, e.PartialAggFuncs, e.MaxChunkSize(), e.memTracker),
+			inputCh:              e.partialInputChs[i],
+			outputChs:            e.partialOutputChs,
+			giveBackCh:           e.inputCh,
+			BInMaps:              make([]int, finalConcurrency),
+			partialResultsBuffer: make([][]aggfuncs.PartialResult, 0, 2048),
+			globalOutputCh:       e.finalOutputCh,
+			partialResultsMap:    partialResultsMap,
+			groupByItems:         e.GroupByItems,
+			chk:                  exec.TryNewCacheChunk(e.Children(0)),
+			groupKey:             make([][]byte, 0, 8),
 		}
+
+		w.partialResultNumInRow = w.getPartialResultSliceLenConsiderByteAlign()
+		for i := 0; i < finalConcurrency; i++ {
+			w.BInMaps[i] = 0
+		}
+
 		// There is a bucket in the empty partialResultsMap.
 		failpoint.Inject("ConsumeRandomPanic", nil)
 		e.memTracker.Consume(hack.DefBucketMemoryUsageForMapStrToSlice * (1 << w.BInMap))
@@ -309,6 +322,8 @@ func (e *HashAggExec) initForParallelExec(_ sessionctx.Context) {
 		w := HashAggFinalWorker{
 			baseHashAggWorker:   newBaseHashAggWorker(e.Ctx(), e.finishCh, e.FinalAggFuncs, e.MaxChunkSize(), e.memTracker),
 			partialResultMap:    make(AggPartialResultMapper),
+			BInMap:              0,
+			isFirstInput:        true,
 			groupSet:            groupSet,
 			inputCh:             e.partialOutputChs[i],
 			outputCh:            e.finalOutputCh,
diff --git a/pkg/executor/aggregate/agg_hash_final_worker.go b/pkg/executor/aggregate/agg_hash_final_worker.go
index a2e9f2edae1f5..ab139f491508a 100644
--- a/pkg/executor/aggregate/agg_hash_final_worker.go
+++ b/pkg/executor/aggregate/agg_hash_final_worker.go
@@ -19,10 +19,10 @@ import (
 	"time"
 
 	"github.com/pingcap/failpoint"
-	"github.com/pingcap/tidb/pkg/executor/aggfuncs"
 	"github.com/pingcap/tidb/pkg/sessionctx"
 	"github.com/pingcap/tidb/pkg/types"
 	"github.com/pingcap/tidb/pkg/util/chunk"
+	"github.com/pingcap/tidb/pkg/util/hack"
 	"github.com/pingcap/tidb/pkg/util/logutil"
 	"github.com/pingcap/tidb/pkg/util/set"
 	"go.uber.org/zap"
@@ -43,14 +43,16 @@ type HashAggFinalWorker struct {
 	rowBuffer           []types.Datum
 	mutableRow          chunk.MutRow
 	partialResultMap    AggPartialResultMapper
+	BInMap              int
+	isFirstInput        bool
 	groupSet            set.StringSetWithMemoryUsage
-	inputCh             chan *HashAggIntermData
+	inputCh             chan *AggPartialResultMapper
 	outputCh            chan *AfFinalResult
 	finalResultHolderCh chan *chunk.Chunk
 	groupKeys           [][]byte
 }
 
-func (w *HashAggFinalWorker) getPartialInput() (input *HashAggIntermData, ok bool) {
+func (w *HashAggFinalWorker) getPartialInput() (input *AggPartialResultMapper, ok bool) {
 	select {
 	case <-w.finishCh:
 		return nil, false
@@ -62,55 +64,60 @@ func (w *HashAggFinalWorker) getPartialInput() (input *HashAggIntermData, ok boo
 	return
 }
 
+func (w *HashAggFinalWorker) initBInMap() {
+	w.BInMap = 0
+	mapLen := len(w.partialResultMap)
+	for mapLen > (1<<w.BInMap)*hack.LoadFactorNum/hack.LoadFactorDen {
+		w.BInMap++
+	}
+}

+			// Map will expand when count > bucketNum * loadFactor. The memory usage will double.
+			if len(w.partialResultMap)+1 > (1<<w.BInMap)*hack.LoadFactorNum/hack.LoadFactorDen {

+			// Map will expand when count > bucketNum * loadFactor. The memory usage will double.
+			if len(mapper[finalWorkerIdx])+1 > (1<<w.BInMaps[finalWorkerIdx])*hack.LoadFactorNum/hack.LoadFactorDen {
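The central change in this patch is that each HashAggPartialWorker now keeps one AggPartialResultMapper per final worker (with a per-map BInMaps counter for memory accounting) and sends those maps over partialOutputChs, so every final worker receives only the groups it owns and merges ready-made maps instead of HashAggIntermData. The Go sketch below illustrates that routing idea only; the names partialResult, resultMapper, and shuffle, and the FNV hash, are stand-ins assumed for illustration, not the identifiers or hash function used by the PR.

package main

import (
	"fmt"
	"hash/fnv"
)

// partialResult is a stand-in for a per-group aggregation state.
type partialResult []int64

// resultMapper is a stand-in for AggPartialResultMapper: group key -> partial result.
type resultMapper map[string]partialResult

// shuffle hashes every group key to pick a final worker index, building one
// mapper per final worker so each final worker only sees the groups it owns.
func shuffle(groups map[string][]int64, finalConcurrency int) []resultMapper {
	mappers := make([]resultMapper, finalConcurrency)
	for i := range mappers {
		mappers[i] = make(resultMapper)
	}
	for key, vals := range groups {
		h := fnv.New64()
		h.Write([]byte(key))
		idx := int(h.Sum64() % uint64(finalConcurrency))
		mappers[idx][key] = partialResult(vals)
	}
	return mappers
}

func main() {
	groups := map[string][]int64{"a": {1, 2}, "b": {3}, "c": {4, 5, 6}}
	for i, m := range shuffle(groups, 2) {
		fmt.Printf("final worker %d owns %d groups\n", i, len(m))
	}
}

Because the same key always hashes to the same index, each group is merged by exactly one final worker, which is what lets the final stage skip re-partitioning the intermediate data.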