
66891: admission: improve slot and grant chain heuristics r=sumeerbhola a=sumeerbhola

The existing heuristic did an acceptable job of reducing the runnable
goroutine count while maintaining ~94% CPU utilization under the kv50
overload workload. However, when examining the behavior at 1ms granularity:
- Running "perf sched record" and "perf sched map" showed occasional
  ~10ms intervals of time where ~6 of the 8 cpus became idle.
- Logging in GrantCoordinator.CPULoad indicated that the immediate
  termination of a grant chain at every 1ms tick caused the granter
  to lose control over the runnable goroutines when there was limited
  KV work (KV work uses slots) but lots of SQL work (SQL work here
  is shorthand for KV=>SQL response work, which uses tokens). At
  every 1ms tick of CPULoad, the previous grant chain would be
  terminated and a new one started that would admit 64 SQL work units
  (procs * admission.kv_slot_adjuster.overload_threshold = 8 * 8).
  The kv50 overload workload has a concurrency of 8192 and within
  100ms one can admit 6400 of this work even though much of it has
  not completed (since work that uses tokens does not have a
  termination signal). The runnable count (all numbers here are
  aggregate across all 8 cpus) would increase to > 4000, causing
  the total slots for KV work to keep getting decreased until they
  reached the minimum of 1 (since the decreases were not helping to
  reduce runnable). Eventually runnable would come down by itself
  as the SQL work items finished, at which point runnable would
  drop to 0 because total slots were still at 1. The total slots
  would then increase by 1 slot every 1ms until enough runnable
  goroutines built up to use all cpus.
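
As a rough sketch (this is not code from the PR; the constants are illustrative, mirroring the scenario described above), the per-tick arithmetic works out as:

```go
package main

import "fmt"

// Illustrative constants for the scenario above; the real values come from
// the number of procs and the admission.kv_slot_adjuster.overload_threshold
// cluster setting.
const (
	procs             = 8 // cpus on the node
	overloadThreshold = 8 // old default of the setting
	tickMillis        = 1 // CPULoad tick interval
)

func main() {
	// Each 1ms tick terminated the grant chain and started a new one that
	// could immediately admit procs * overloadThreshold token-based work
	// units.
	burstPerTick := procs * overloadThreshold
	fmt.Println("burst per tick:", burstPerTick) // 64

	// Token-based (SQL) work has no termination signal, so within 100ms
	// the granter could admit all of this with nothing pushing back.
	fmt.Println("admitted in 100ms:", burstPerTick*(100/tickMillis)) // 6400
}
```

With a concurrency of 8192, 6400 admitted-but-unfinished work items is enough to drive the aggregate runnable count past 4000.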

There are 2 related changes made here:
- A 100ms lag is introduced for grant chain termination: a grant
  chain is terminated only when the oldest attempt to terminate it
  is > 100ms old. This means the throttling introduced by the grant
  chain mechanism actually takes effect, since the same grant chain
  stays active for a sustained period.
- The default of admission.kv_slot_adjuster.overload_threshold is
  bumped up to 32 and a grant chain uses this value divided by 4 as
  a multiplier. This allows a grant chain to still burst with the
  same burst size as before but ensures that a single burst does not
  trigger the runnable count to be high enough such that total slots
  start getting decreased.
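
A minimal sketch of the new burst-size computation (the function name here is illustrative; the real logic lives in GrantCoordinator.tryGrant):

```go
package main

import "fmt"

// grantBurstLimit is an illustrative standalone version of the burst-size
// computation described above: the overload threshold divided by 4 is used
// as a multiplier on the proc count, floored at 1.
func grantBurstLimit(numProcs, overloadThreshold int) int {
	multiplier := overloadThreshold / 4
	if multiplier == 0 {
		// Floor at 1 so that small threshold settings still allow a burst.
		multiplier = 1
	}
	return numProcs * multiplier
}

func main() {
	// Old behavior: threshold 8 used directly => 8 * 8 = 64.
	// New behavior: threshold 32 divided by 4 => 8 * (32/4) = 64, the same
	// burst size, but the slot adjuster now only sees "overload" at 4x the
	// runnable count per cpu.
	fmt.Println(grantBurstLimit(8, 32)) // 64
	fmt.Println(grantBurstLimit(8, 2))  // 8, via the floor of 1
}
```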

This did not increase the mean CPU utilization (still ~94%), but
there are other improvements visible when examining behavior at 1ms
intervals. The runnable count rarely drops below 10. Even when it
does, the number of currently used KV slots is > 200, which suggests
that we are running
into the limits of what control we can exercise without changing the
scheduler (the KV work is probably waiting on IO, which is not
observable to the admission control system). Despite a 4x higher
admission.kv_slot_adjuster.overload_threshold, which results
in total KV slots of ~400, the peak runnable count is now ~800,
down from the earlier ~4000.

Some screenshots with `kv50/enc=false/nodes=1/conc=8192` (admission control was turned on between 14:45-14:46)

<img width="783" alt="Screen Shot 2021-06-25 at 10 51 30 AM" src="https://user-images.githubusercontent.com/54990988/123456792-40eaf600-d5b1-11eb-8771-5e08f54dd660.png">
<img width="756" alt="Screen Shot 2021-06-25 at 10 51 51 AM" src="https://user-images.githubusercontent.com/54990988/123459874-d9cf4080-d5b4-11eb-8668-6f5190c4f94c.png">
<img width="787" alt="Screen Shot 2021-06-25 at 10 52 06 AM" src="https://user-images.githubusercontent.com/54990988/123459898-dfc52180-d5b4-11eb-8ce2-379af4912bed.png">
<img width="764" alt="Screen Shot 2021-06-25 at 10 52 24 AM" src="https://user-images.githubusercontent.com/54990988/123459918-e6ec2f80-d5b4-11eb-9ba4-a228df1068d4.png">


Release note: None

66995: roachtest: bump version for 20.2 to 20.2.12 r=tbg a=ajwerner

Part of the release process. Relates to cockroachdb#66627.

Release note: None

67020: AUTHORS: add kpatron r=kpatron-cockroachlabs a=kpatron-cockroachlabs



67022: Update AUTHORS r=ZhouXing19 a=ZhouXing19

Added Jane's name and email

Co-authored-by: sumeerbhola <[email protected]>
Co-authored-by: Andrew Werner <[email protected]>
Co-authored-by: Kyle Patron <[email protected]>
Co-authored-by: Zhou Xing <[email protected]>
5 people committed Jun 29, 2021
5 parents b323e68 + af72db7 + fbf2bc1 + ed77461 + bad105f commit 8e8637d
Showing 4 changed files with 56 additions and 15 deletions.
2 changes: 2 additions & 0 deletions AUTHORS
@@ -168,6 +168,7 @@ Jackson Owens <[email protected]> <[email protected]>
James Graves <[email protected]>
James H. Linder <[email protected]>
Jan Owsiany <[email protected]>
Jane Xing <[email protected]> <[email protected]>
Jason E. Aten <[email protected]>
Jason Young <[email protected]>
Jay Kominek <[email protected]>
@@ -220,6 +221,7 @@ Kevin Kokomani <[email protected]> <[email protected]>
Kevin Zheng <[email protected]> <[email protected]>
kiran <[email protected]>
Kristy Gao <[email protected]> gaoxk <[email protected]>
Kyle Patron <[email protected]>
lanzao <[email protected]>
Lasantha Pambagoda <[email protected]>
Lauren Hirata <[email protected]> Lauren <[email protected]> lhirata <[email protected]>
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/test_runner.go
@@ -1244,7 +1244,7 @@ func PredecessorVersion(buildVersion version.Version) (string, error) {
// map.
verMap := map[string]string{
"21.2": "21.1.3",
-"21.1": "20.2.10",
+"21.1": "20.2.12",
"20.2": "20.1.16",
"20.1": "19.2.11",
"19.2": "19.1.11",
66 changes: 52 additions & 14 deletions pkg/util/admission/granter.go
@@ -33,7 +33,7 @@ var KVSlotAdjusterOverloadThreshold = settings.RegisterIntSetting(
"admission.kv_slot_adjuster.overload_threshold",
"when the number of runnable goroutines per CPU is greater than this threshold, the "+
"slot adjuster considers the cpu to be overloaded",
-8, settings.PositiveInt)
+32, settings.PositiveInt)

// grantChainID is the ID for a grant chain. See continueGrantChain for
// details.
@@ -566,6 +566,8 @@ type GrantCoordinator struct {
// Index into granters, which represents the current WorkKind at which the
// grant chain is operating. Only relevant when grantChainActive is true.
grantChainIndex WorkKind
// See the comment at delayForGrantChainTermination for motivation.
grantChainStartTime time.Time
}

var _ CPULoadListener = &GrantCoordinator{}
@@ -855,9 +857,8 @@ func (coord *GrantCoordinator) CPULoad(runnable int, procs int) {
coord.cpuLoadListener.CPULoad(runnable, procs)
coord.granters[SQLKVResponseWork].(*tokenGranter).refillBurstTokens()
coord.granters[SQLSQLResponseWork].(*tokenGranter).refillBurstTokens()
-if coord.grantChainActive {
-coord.grantChainID++
-coord.grantChainActive = false
+if coord.grantChainActive && !coord.tryTerminateGrantChain() {
+return
}
coord.tryGrant()
}
@@ -882,7 +883,7 @@ func (coord *GrantCoordinator) tryGet(workKind WorkKind) bool {
// grant chain. We don't want it to continue granting to lower priority
// WorkKinds, while a higher priority one is waiting, so we terminate it.
if coord.grantChainActive && coord.grantChainIndex >= workKind {
-coord.terminateGrantChain()
+coord.tryTerminateGrantChain()
}
return false
case grantFailLocal:
@@ -898,10 +899,13 @@ func (coord *GrantCoordinator) returnGrant(workKind WorkKind) {
defer coord.mu.Unlock()
coord.granters[workKind].returnGrantLocked()
if coord.grantChainActive {
-if coord.grantChainIndex > workKind && coord.granters[workKind].getPairedRequester().hasWaitingRequests() {
+if coord.grantChainIndex > workKind &&
+coord.granters[workKind].getPairedRequester().hasWaitingRequests() {
// There are waiting requests that will not be served by the grant chain.
// Better to terminate it and start afresh.
-coord.terminateGrantChain()
+if !coord.tryTerminateGrantChain() {
+return
+}
} else {
// Else either the grant chain will get to this workKind, or there are no waiting requests.
return
@@ -933,14 +937,39 @@ func (coord *GrantCoordinator) continueGrantChain(workKind WorkKind, grantChainI
coord.tryGrant()
}

-// terminateGrantChain terminates the current grant chain. A new one can
-// be immediately started.
+// delayForGrantChainTermination causes a delay in terminating a grant chain.
+// Terminating a grant chain immediately typically causes a new one to start
+// immediately that can burst up to its maximum initial grant burst. Which
+// means frequent terminations followed by new starts impose little control
+// over the rate at which tokens are granted (slots are better controlled
+// since we know when the work finishes). This causes huge spikes in the
+// runnable goroutine count, observed at 1ms granularity. This spike causes
+// the kvSlotAdjuster to ratchet down the totalSlots for KV work all the way
+// down to 1, which later causes the runnable goroutine count to crash down
+// to a value close to 0, leading to under-utilization.
+//
+// TODO(sumeer): design admission behavior metrics that can be used to
+// understand the behavior in detail and to quantify improvements when changing
+// heuristics. One metric would be mean and variance of the runnable count,
+// computed using the 1ms samples, and exported/logged every 60s.
+var delayForGrantChainTermination = 100 * time.Millisecond
+
+// tryTerminateGrantChain attempts to terminate the current grant chain, and
+// returns true iff it is terminated, in which case a new one can be
+// immediately started.
// REQUIRES: coord.grantChainActive==true
-func (coord *GrantCoordinator) terminateGrantChain() {
+func (coord *GrantCoordinator) tryTerminateGrantChain() bool {
+now := timeutil.Now()
+if delayForGrantChainTermination > 0 &&
+now.Sub(coord.grantChainStartTime) < delayForGrantChainTermination {
+return false
+}
// Incrementing the ID will cause the existing grant chain to die out when
// the grantee calls continueGrantChain.
coord.grantChainID++
coord.grantChainActive = false
+coord.grantChainStartTime = time.Time{}
+return true
}

// tryGrant tries to either continue an existing grant chain, or tries to
@@ -959,10 +988,16 @@ func (coord *GrantCoordinator) tryGrant() {
// Grant in a burst proportional to numProcs, to generate a runnable for
// each.
grantBurstLimit := coord.numProcs
-multiplier := int(KVSlotAdjusterOverloadThreshold.Get(&coord.settings.SV))
-// Additionally, increase the burst size proportional to the overload
-// threshold. We experimentally observed that this resulted in better CPU
-// utilization.
+// Additionally, increase the burst size proportional to a fourth of the
+// overload threshold. We experimentally observed that this resulted in
+// better CPU utilization. We don't use the full overload threshold since we
+// don't want to over grant for non-KV work since that causes the KV slots
+// to (unfairly) start decreasing, since we lose control over how many
+// goroutines are runnable.
+multiplier := int(KVSlotAdjusterOverloadThreshold.Get(&coord.settings.SV) / 4)
+if multiplier == 0 {
+multiplier = 1
+}
grantBurstLimit *= multiplier
// Only the case of a grant chain being active returns from within the
// OuterLoop.
@@ -985,6 +1020,9 @@ OuterLoop:
grantBurstCount++
if grantBurstCount == grantBurstLimit {
coord.grantChainActive = true
if startingChain {
coord.grantChainStartTime = timeutil.Now()
}
return
}
}
1 change: 1 addition & 0 deletions pkg/util/admission/granter_test.go
@@ -124,6 +124,7 @@ func TestGranterBasic(t *testing.T) {
requesters[workKind] = req
return req
}
delayForGrantChainTermination = 0
coord, _ = NewGrantCoordinator(opts)
return flushAndReset()

