Merge #116298 #116980 #116987 #117118

116298: roachtest: improve sqlsmith for an elusive setup error r=mgartner a=yuzefovich **roachtest: log erroneous stmt in sqlsmith.log** Previously, if we hit an error when executing a setup query, we would only include it in the error message (so it would only be seen in the test.log file). This commit makes it so that the erroneous setup query is included in sqlsmith.log as well. Informs: #116160. Informs: #116307. **roachtest/sqlsmith: print a setup query as integers** This commit adjusts the sqlsmith logging so that if it encounters an error during the setup that contains "does not exist" substring, then we will also log the failed stmt as space-separated integers. This change is made in hopes of being able to reproduce the elusive 'pq: column "crdb_internal_idx_expr" does not exist' error we've seen a few times. My hypothesis is that there is some non-visible character that gets lost when stringified. Epic: None Release note: None 116980: pkg/bench: benchmark latency impact of gc when hashing r=dt a=dt Release note: none. Epic: none. 116987: tenantcostclient: add client consumption metrics r=JeffSwenson a=jaylim-crl Previously, consumption metrics were only available in KV servers. These metrics are updated whenever SQL servers send a TokenBucketRequest with the consumption since the last request to KV servers. However, those metrics were insufficient for CockroachDB Cloud's use case. Specifically, for metrics export to work, we need to fetch these data separately from the KV servers, which introduces its own issues. To address that, this commit introduces consumption metrics within the SQL servers. This change allows us to rely solely on metrics exported by the SQL servers, eliminating the need to build something externally to fetch data from the KV servers. It is worth noting that the aggregated values for these metrics will always be greater than or equal to the values in the KV servers since metrics are updated locally before requests to KV servers. 
Additionally, this commit also fixes a bug where the existing tenantcostclient metrics, introduced in #113512, were not added to the main registry. As a result, they were unavailable via the status server. Epic: [CC-26682](https://cockroachlabs.atlassian.net/browse/CC-26682) Release note: None 117118: sql/rowcontainer: remove propagation of synthetic timestamp bit r=nvanbenschoten a=nvanbenschoten Informs #101938. This PR removes the propagation of the synthetic flag through `kvStreamerResultDiskBuffer`. It then cleans up the use of synthetic timestamps in a few SQL tests. This flag has been deprecated since v22.2 and is no longer consulted in uncertainty interval checks or by transaction commit-wait. It does not need to be propagated. Release note: None Co-authored-by: Yahor Yuzefovich <[email protected]> Co-authored-by: David Taylor <[email protected]> Co-authored-by: Jay <[email protected]> Co-authored-by: Nathan VanBenschoten <[email protected]>
cockroachdb · Jan 2, 2024 · e89ce89 · e89ce89
5 parents 13ce29e + e06a0bc + a5b2f5f + 97860fd + 3543d30
commit e89ce89
Show file tree

Hide file tree

Showing 19 changed files with 703 additions and 26 deletions.
diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html
@@ -1489,6 +1489,20 @@
 <tr><td>APPLICATION</td><td>sqlliveness.sessions_deletion_runs</td><td>Number of calls to delete sessions which have been performed</td><td>Sessions</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>APPLICATION</td><td>sqlliveness.write_failures</td><td>Number of update or insert calls which have failed</td><td>Writes</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>APPLICATION</td><td>sqlliveness.write_successes</td><td>Number of update or insert calls successfully performed</td><td>Writes</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.cost_client.blocked_requests</td><td>Number of requests currently blocked by the rate limiter</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.cross_region_network_ru</td><td>Total number of RUs charged for cross-region network traffic</td><td>Request Units</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.external_io_egress_bytes</td><td>Total number of bytes written to external services such as cloud storage providers</td><td>Bytes</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.external_io_ingress_bytes</td><td>Total number of bytes read from external services such as cloud storage providers</td><td>Bytes</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.kv_request_units</td><td>RU consumption attributable to KV</td><td>Request Units</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.pgwire_egress_bytes</td><td>Total number of bytes transferred from a SQL pod to the client</td><td>Bytes</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.read_batches</td><td>Total number of KV read batches</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.read_bytes</td><td>Total number of bytes read from KV</td><td>Bytes</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.read_requests</td><td>Total number of KV read requests</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.request_units</td><td>RU consumption</td><td>Request Units</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.sql_pods_cpu_seconds</td><td>Total amount of CPU used by SQL pods</td><td>CPU Seconds</td><td>COUNTER</td><td>SECONDS</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.write_batches</td><td>Total number of KV write batches</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.write_bytes</td><td>Total number of bytes written to KV</td><td>Bytes</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>APPLICATION</td><td>tenant.sql_usage.write_requests</td><td>Total number of KV write requests</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>APPLICATION</td><td>txn.aborts</td><td>Number of aborted KV transactions</td><td>KV Transactions</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>APPLICATION</td><td>txn.commit_waits</td><td>Number of KV transactions that had to commit-wait on commit in order to ensure linearizability. This generally happens to transactions writing to global ranges.</td><td>KV Transactions</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>APPLICATION</td><td>txn.commits</td><td>Number of committed KV transactions (including 1PC)</td><td>KV Transactions</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>

diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel
@@ -4,6 +4,7 @@
 ALL_TESTS = [
     "//pkg/acceptance:acceptance_test",
     "//pkg/base:base_test",
+    "//pkg/bench/hashbench:hashbench_test",
     "//pkg/bench/rttanalysis:rttanalysis_test",
     "//pkg/bench/tpcc:tpcc_test",
     "//pkg/bench:bench_test",
@@ -763,6 +764,7 @@ GO_TARGETS = [
     "//pkg/base:base_test",
     "//pkg/bench/cmd/pgbenchsetup:pgbenchsetup",
     "//pkg/bench/cmd/pgbenchsetup:pgbenchsetup_lib",
+    "//pkg/bench/hashbench:hashbench_test",
     "//pkg/bench/rttanalysis:rttanalysis",
     "//pkg/bench/rttanalysis:rttanalysis_test",
     "//pkg/bench/tpcc:tpcc",

diff --git a/pkg/bench/hashbench/BUILD.bazel b/pkg/bench/hashbench/BUILD.bazel
@@ -0,0 +1,6 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
+go_test(
+    name = "hashbench_test",
+    srcs = ["hash_gc_test.go"],
+)
diff --git a/pkg/bench/hashbench/hash_gc_test.go b/pkg/bench/hashbench/hash_gc_test.go
@@ -0,0 +1,128 @@
+// Copyright 2023 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package hashbench
+
+import (
+	"crypto/md5"
+	"crypto/sha256"
+	"fmt"
+	"hash"
+	"runtime"
+	"testing"
+	"time"
+)
+
+// chunkingHash wraps a hash but chunks up writes passed to it.
+type chunkingHash struct {
+	hash.Hash
+	chunkSize int64 // upper bound, in bytes, on each Write issued to the wrapped hash
+}
+
+var _ hash.Hash = chunkingHash{}
+
+func (c chunkingHash) Write(b []byte) (n int, _ error) {
+	for int64(len(b)) > c.chunkSize { // hand the wrapped hash at most chunkSize bytes per call
+		w, _ := c.Hash.Write(b[:c.chunkSize]) // error dropped: hash.Hash documents that Write never returns one
+		b = b[w:]
+		n += w
+	}
+	w, _ := c.Hash.Write(b) // remainder, no longer than chunkSize once the loop exits
+	return n + w, nil
+}
+
+// BenchmarkLatencyWhileHashing demonstrates the latency impact (in this case on
+// a loop that just sleeps) that hashing various sizes of blocks of bytes has in
+// the presence of GC, due to https://github.com/golang/go/issues/64417.
+func BenchmarkLatencyWhileHashing(b *testing.B) {
+	for _, hashImpl := range []struct {
+		name string
+		hash.Hash
+	}{{name: "MD5", Hash: md5.New()}, {name: "SHA256", Hash: sha256.New()}} {
+		for _, blockSizeKB := range []int64{0, 64, 128, 512, 1024, 2048, 4096} {
+			for _, fileSizeMB := range []int64{4, 8, 16, 32} {
+				blockSizeStr := fmt.Sprintf("%dKB", blockSizeKB)
+				if blockSizeKB == 0 {
+					blockSizeStr = "whole"
+				}
+				b.Run(fmt.Sprintf("hash=%s/block=%s/file=%dMB", hashImpl.name, blockSizeStr, fileSizeMB), func(b *testing.B) {
+					h, fileSize := hashImpl.Hash, fileSizeMB<<20
+					if blockSizeKB > 0 {
+						h = chunkingHash{Hash: h, chunkSize: blockSizeKB << 10}
+					}
+
+					stop, stoppedGC, stoppedHashing := make(chan struct{}), make(chan struct{}), make(chan struct{})
+
+					// Hammer GC, in a loop, for the duration of the test. These GCs may
+					// stop the world, which would contribute to the latency measured in
+					// the benchmark loop below.
+					go func() {
+						defer close(stoppedGC)
+						for {
+							select {
+							case <-stop:
+								return
+							default:
+								runtime.GC()
+							}
+						}
+					}()
+
+					// Do some hashing work for the duration of the test. These hashing
+					// calls will need to be stopped along with the rest of the world by
+					// the GCs being run in the loop above; if they are slow to
+					// stop, they'd delay the GC stops and thus increase the observed
+					// latency in the benchmark loop.
+					go func() {
+						defer close(stoppedHashing)
+						for {
+							select {
+							case <-stop:
+								return
+							default:
+								if h != nil {
+									buf := make([]byte, fileSize)
+									for i := 0; i < 10; i++ {
+										h.Write(buf)
+										h.Sum(nil)
+									}
+								}
+							}
+						}
+					}()
+
+					b.ResetTimer()
+
+					// Measure worst-case latency from one loop iteration to the next, with
+					// each iteration sleeping a fixed interval, while the GC and hashing
+					// loops above are also running.
+					var worst time.Duration
+					before := time.Now()
+					for i := 0; i < b.N; i++ {
+						time.Sleep(time.Microsecond * 100)
+						if d := time.Since(before); d > worst {
+							worst = d
+						}
+						before = time.Now()
+					}
+
+					b.StopTimer()
+					b.ReportMetric(float64(worst.Microseconds()), "max-latency")
+
+					// Stop the two background loops and wait for them to exit before
+					// moving on to the next test-case.
+					close(stop)
+					<-stoppedGC
+					<-stoppedHashing
+				})
+			}
+		}
+	}
+}
diff --git a/pkg/ccl/multitenantccl/tenantcostclient/metrics.go b/pkg/ccl/multitenantccl/tenantcostclient/metrics.go
@@ -8,7 +8,10 @@
 
 package tenantcostclient
 
-import "github.com/cockroachdb/cockroach/pkg/util/metric"
+import (
+	"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
+	"github.com/cockroachdb/cockroach/pkg/util/metric"
+)
 
 var (
 	metaCurrentBlocked = metric.Metadata{
@@ -17,14 +20,143 @@ var (
 		Measurement: "Requests",
 		Unit:        metric.Unit_COUNT,
 	}
+
+	// SQL usage related metrics.
+	metaTotalRU = metric.Metadata{
+		Name:        "tenant.sql_usage.request_units",
+		Help:        "RU consumption",
+		Measurement: "Request Units",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaTotalKVRU = metric.Metadata{
+		Name:        "tenant.sql_usage.kv_request_units",
+		Help:        "RU consumption attributable to KV",
+		Measurement: "Request Units",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaTotalReadBatches = metric.Metadata{
+		Name:        "tenant.sql_usage.read_batches",
+		Help:        "Total number of KV read batches",
+		Measurement: "Requests",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaTotalReadRequests = metric.Metadata{
+		Name:        "tenant.sql_usage.read_requests",
+		Help:        "Total number of KV read requests",
+		Measurement: "Requests",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaTotalReadBytes = metric.Metadata{
+		Name:        "tenant.sql_usage.read_bytes",
+		Help:        "Total number of bytes read from KV",
+		Measurement: "Bytes",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaTotalWriteBatches = metric.Metadata{
+		Name:        "tenant.sql_usage.write_batches",
+		Help:        "Total number of KV write batches",
+		Measurement: "Requests",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaTotalWriteRequests = metric.Metadata{
+		Name:        "tenant.sql_usage.write_requests",
+		Help:        "Total number of KV write requests",
+		Measurement: "Requests",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaTotalWriteBytes = metric.Metadata{
+		Name:        "tenant.sql_usage.write_bytes",
+		Help:        "Total number of bytes written to KV",
+		Measurement: "Bytes",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaTotalSQLPodsCPUSeconds = metric.Metadata{
+		Name:        "tenant.sql_usage.sql_pods_cpu_seconds",
+		Help:        "Total amount of CPU used by SQL pods",
+		Measurement: "CPU Seconds",
+		Unit:        metric.Unit_SECONDS,
+	}
+	metaTotalPGWireEgressBytes = metric.Metadata{
+		Name:        "tenant.sql_usage.pgwire_egress_bytes",
+		Help:        "Total number of bytes transferred from a SQL pod to the client",
+		Measurement: "Bytes",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaTotalExternalIOIngressBytes = metric.Metadata{
+		Name:        "tenant.sql_usage.external_io_ingress_bytes",
+		Help:        "Total number of bytes read from external services such as cloud storage providers",
+		Measurement: "Bytes",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaTotalExternalIOEgressBytes = metric.Metadata{
+		Name:        "tenant.sql_usage.external_io_egress_bytes",
+		Help:        "Total number of bytes written to external services such as cloud storage providers",
+		Measurement: "Bytes",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaTotalCrossRegionNetworkRU = metric.Metadata{
+		Name:        "tenant.sql_usage.cross_region_network_ru",
+		Help:        "Total number of RUs charged for cross-region network traffic",
+		Measurement: "Request Units",
+		Unit:        metric.Unit_COUNT,
+	}
 )
 
 // metrics manage the metrics used by the tenant cost client.
 type metrics struct {
-	CurrentBlocked *metric.Gauge
+	CurrentBlocked              *metric.Gauge
+	TotalRU                     *metric.CounterFloat64
+	TotalKVRU                   *metric.CounterFloat64
+	TotalReadBatches            *metric.Counter
+	TotalReadRequests           *metric.Counter
+	TotalReadBytes              *metric.Counter
+	TotalWriteBatches           *metric.Counter
+	TotalWriteRequests          *metric.Counter
+	TotalWriteBytes             *metric.Counter
+	TotalSQLPodsCPUSeconds      *metric.CounterFloat64
+	TotalPGWireEgressBytes      *metric.Counter
+	TotalExternalIOEgressBytes  *metric.Counter
+	TotalExternalIOIngressBytes *metric.Counter
+	TotalCrossRegionNetworkRU   *metric.CounterFloat64
 }
 
+var _ metric.Struct = (*metrics)(nil)
+
+// MetricStruct marks metrics as implementing metric.Struct.
+func (m *metrics) MetricStruct() {}
+
 // Init creates the CurrentBlocked gauge and every Total* consumption counter.
 func (m *metrics) Init() {
 	m.CurrentBlocked = metric.NewGauge(metaCurrentBlocked)
+	m.TotalRU = metric.NewCounterFloat64(metaTotalRU)
+	m.TotalKVRU = metric.NewCounterFloat64(metaTotalKVRU)
+	m.TotalReadBatches = metric.NewCounter(metaTotalReadBatches)
+	m.TotalReadRequests = metric.NewCounter(metaTotalReadRequests)
+	m.TotalReadBytes = metric.NewCounter(metaTotalReadBytes)
+	m.TotalWriteBatches = metric.NewCounter(metaTotalWriteBatches)
+	m.TotalWriteRequests = metric.NewCounter(metaTotalWriteRequests)
+	m.TotalWriteBytes = metric.NewCounter(metaTotalWriteBytes)
+	m.TotalSQLPodsCPUSeconds = metric.NewCounterFloat64(metaTotalSQLPodsCPUSeconds)
+	m.TotalPGWireEgressBytes = metric.NewCounter(metaTotalPGWireEgressBytes)
+	m.TotalExternalIOEgressBytes = metric.NewCounter(metaTotalExternalIOEgressBytes)
+	m.TotalExternalIOIngressBytes = metric.NewCounter(metaTotalExternalIOIngressBytes)
+	m.TotalCrossRegionNetworkRU = metric.NewCounterFloat64(metaTotalCrossRegionNetworkRU)
+}
+
+// incrementConsumption adds every field of delta to its matching counter.
+// Fields feeding a plain Counter are converted to int64 first.
+func (m *metrics) incrementConsumption(delta kvpb.TenantConsumption) {
+	m.TotalRU.Inc(delta.RU)
+	m.TotalKVRU.Inc(delta.KVRU)
+	m.TotalReadBatches.Inc(int64(delta.ReadBatches))
+	m.TotalReadRequests.Inc(int64(delta.ReadRequests))
+	m.TotalReadBytes.Inc(int64(delta.ReadBytes))
+	m.TotalWriteBatches.Inc(int64(delta.WriteBatches))
+	m.TotalWriteRequests.Inc(int64(delta.WriteRequests))
+	m.TotalWriteBytes.Inc(int64(delta.WriteBytes))
+	m.TotalSQLPodsCPUSeconds.Inc(delta.SQLPodsCPUSeconds)
+	m.TotalPGWireEgressBytes.Inc(int64(delta.PGWireEgressBytes))
+	m.TotalExternalIOEgressBytes.Inc(int64(delta.ExternalIOEgressBytes))
+	m.TotalExternalIOIngressBytes.Inc(int64(delta.ExternalIOIngressBytes))
+	m.TotalCrossRegionNetworkRU.Inc(delta.CrossRegionNetworkRU)
 }
diff --git a/pkg/ccl/multitenantccl/tenantcostclient/tenant_side.go b/pkg/ccl/multitenantccl/tenantcostclient/tenant_side.go
@@ -25,6 +25,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
 	"github.com/cockroachdb/cockroach/pkg/sql/sqlliveness"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
+	"github.com/cockroachdb/cockroach/pkg/util/metric"
 	"github.com/cockroachdb/cockroach/pkg/util/stop"
 	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
@@ -321,6 +322,9 @@ type tenantSideCostController struct {
 		// lastReportedConsumption is the set of tenant resource consumption
 		// metrics last sent to the token bucket server.
 		lastReportedConsumption kvpb.TenantConsumption
+		// lastExportedConsumption is the set of tenant resource consumption
+		// metrics last sent to the metrics registry.
+		lastExportedConsumption kvpb.TenantConsumption
 		// lastRate is the token bucket fill rate that was last configured.
 		lastRate float64
 
@@ -471,6 +475,13 @@ func (c *tenantSideCostController) onTick(ctx context.Context, newTime time.Time
 		c.run.fallbackRateStart = time.Time{}
 	}
 
+	// Report consumption metrics. Update local data first before sending a
+	// token bucket request to the KV servers.
+	deltaConsumption := c.run.consumption
+	deltaConsumption.Sub(&c.run.lastExportedConsumption)
+	c.run.lastExportedConsumption = c.run.consumption
+	c.metrics.incrementConsumption(deltaConsumption)
+
 	// Should a token bucket request be sent? It might be for a retry or for
 	// periodic consumption reporting.
 	if c.run.shouldSendRequest || c.shouldReportConsumption() {
@@ -893,3 +904,8 @@ func (c *tenantSideCostController) GetCPUMovingAvg() float64 {
 func (c *tenantSideCostController) GetCostConfig() *tenantcostmodel.Config {
 	return c.costCfg.Load()
 }
+
+// Metrics returns the controller's own metrics field (by pointer, not a copy) as a metric.Struct.
+func (c *tenantSideCostController) Metrics() metric.Struct {
+	return &c.metrics
+}