From b118aaa972d462a842d34b19509a3e40efcfad5b Mon Sep 17 00:00:00 2001 From: Andrew Kimball Date: Fri, 1 Jan 2021 09:28:07 -0800 Subject: [PATCH] server: add diagnostics support for tenants Previously, SQL-only tenants did not report feature usage to the registration server. This commit adds that support via the creation of a new diagnostics package that encapsulates the usage report code in updates.go. However, since this commit needs to be backported to 20.2, the new code is *only* used with tenants, and is side-by-side with the existing code, which will continue to operate as before. A follow-on commit (that will not be backported) will fully replace the old implementation with the new. Release note (sql change): Multi-tenant clusters will now send anonymous usage information to the central CRDB registration server. --- pkg/base/test_server_args.go | 7 +- pkg/ccl/backupccl/backup_test.go | 22 +- pkg/ccl/backupccl/show_test.go | 2 +- pkg/ccl/serverccl/server_sql_test.go | 21 +- pkg/cli/mt_start_sql.go | 11 +- pkg/server/config.go | 6 - pkg/server/diagnostics/main_test.go | 27 ++ pkg/server/diagnostics/reporter.go | 415 +++++++++++++++++ pkg/server/diagnostics/reporter_test.go | 420 ++++++++++++++++++ pkg/server/server_sql.go | 39 +- pkg/server/testserver.go | 47 +- pkg/sql/exec_util.go | 4 + pkg/sql/logictest/logic.go | 7 +- pkg/sql/run_control_test.go | 2 +- pkg/sql/telemetry_test.go | 267 ++++++----- pkg/testutils/diagutils/diag_test_server.go | 6 + pkg/testutils/serverutils/test_server_shim.go | 43 +- pkg/testutils/serverutils/test_tenant_shim.go | 37 ++ pkg/testutils/testcluster/testcluster.go | 3 +- 19 files changed, 1224 insertions(+), 162 deletions(-) create mode 100644 pkg/server/diagnostics/main_test.go create mode 100644 pkg/server/diagnostics/reporter_test.go create mode 100644 pkg/testutils/serverutils/test_tenant_shim.go diff --git a/pkg/base/test_server_args.go b/pkg/base/test_server_args.go index e72db2084e34..5406baf35c53 100644 --- a/pkg/base/test_server_args.go +++ b/pkg/base/test_server_args.go @@ -216,11 +216,10 @@ type TestTenantArgs struct { // cluster settings. AllowSettingClusterSettings bool - // TenantIDCodecOverride overrides the tenant ID used to construct the SQL - // server's codec, but nothing else (e.g. its certs). Used for testing. - TenantIDCodecOverride roachpb.TenantID - // Stopper, if not nil, is used to stop the tenant manually otherwise the // TestServer stopper will be used. Stopper *stop.Stopper + + // TestingKnobs for the test server. + TestingKnobs TestingKnobs } diff --git a/pkg/ccl/backupccl/backup_test.go b/pkg/ccl/backupccl/backup_test.go index 4320051824b1..a449d824c891 100644 --- a/pkg/ccl/backupccl/backup_test.go +++ b/pkg/ccl/backupccl/backup_test.go @@ -3465,7 +3465,7 @@ func TestBackupTenantsWithRevisionHistory(t *testing.T) { ctx, tc, sqlDB, _, cleanupFn := BackupRestoreTestSetup(t, singleNode, numAccounts, InitNone) defer cleanupFn() - _, _, err := tc.Servers[0].StartTenant(base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10)}) + _, err := tc.Servers[0].StartTenant(base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10)}) require.NoError(t, err) const msg = "can not backup tenants with revision history" @@ -5819,17 +5819,17 @@ func TestBackupRestoreTenant(t *testing.T) { _ = security.EmbeddedTenantIDs() // Setup a few tenants, each with a different table. - conn10 := serverutils.StartTenant(t, srv, base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10)}) + _, conn10 := serverutils.StartTenant(t, srv, base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10)}) defer conn10.Close() tenant10 := sqlutils.MakeSQLRunner(conn10) tenant10.Exec(t, `CREATE DATABASE foo; CREATE TABLE foo.bar(i int primary key); INSERT INTO foo.bar VALUES (110), (210)`) - conn11 := serverutils.StartTenant(t, srv, base.TestTenantArgs{TenantID: roachpb.MakeTenantID(11)}) + _, conn11 := serverutils.StartTenant(t, srv, base.TestTenantArgs{TenantID: roachpb.MakeTenantID(11)}) defer conn11.Close() tenant11 := sqlutils.MakeSQLRunner(conn11) tenant11.Exec(t, `CREATE DATABASE foo; CREATE TABLE foo.baz(i int primary key); INSERT INTO foo.baz VALUES (111), (211)`) - conn20 := serverutils.StartTenant(t, srv, base.TestTenantArgs{TenantID: roachpb.MakeTenantID(20)}) + _, conn20 := serverutils.StartTenant(t, srv, base.TestTenantArgs{TenantID: roachpb.MakeTenantID(20)}) defer conn20.Close() tenant20 := sqlutils.MakeSQLRunner(conn20) tenant20.Exec(t, `CREATE DATABASE foo; CREATE TABLE foo.qux(i int primary key); INSERT INTO foo.qux VALUES (120), (220)`) @@ -5881,7 +5881,7 @@ func TestBackupRestoreTenant(t *testing.T) { ) ten10Stopper := stop.NewStopper() - restoreConn10 := serverutils.StartTenant( + _, restoreConn10 := serverutils.StartTenant( t, restoreTC.Server(0), base.TestTenantArgs{ TenantID: roachpb.MakeTenantID(10), Existing: true, Stopper: ten10Stopper, }, @@ -5914,7 +5914,7 @@ func TestBackupRestoreTenant(t *testing.T) { [][]string{{`10`, `true`, `{"id": "10", "state": "ACTIVE"}`}}, ) - restoreConn10 = serverutils.StartTenant( + _, restoreConn10 = serverutils.StartTenant( t, restoreTC.Server(0), base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10), Existing: true}, ) defer restoreConn10.Close() @@ -5938,7 +5938,7 @@ func TestBackupRestoreTenant(t *testing.T) { [][]string{{`10`, `true`, `{"id": "10", "state": "ACTIVE"}`}}, ) - restoreConn10 := serverutils.StartTenant( + _, restoreConn10 := serverutils.StartTenant( t, restoreTC.Server(0), base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10), Existing: true}, ) defer restoreConn10.Close() @@ -5967,7 +5967,7 @@ func TestBackupRestoreTenant(t *testing.T) { }, ) - restoreConn10 := serverutils.StartTenant( + _, restoreConn10 := serverutils.StartTenant( t, restoreTC.Server(0), base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10), Existing: true}, ) defer restoreConn10.Close() @@ -5976,7 +5976,7 @@ func TestBackupRestoreTenant(t *testing.T) { restoreTenant10.CheckQueryResults(t, `select * from foo.bar`, tenant10.QueryStr(t, `select * from foo.bar`)) restoreTenant10.CheckQueryResults(t, `select * from foo.bar2`, tenant10.QueryStr(t, `select * from foo.bar2`)) - restoreConn11 := serverutils.StartTenant( + _, restoreConn11 := serverutils.StartTenant( t, restoreTC.Server(0), base.TestTenantArgs{TenantID: roachpb.MakeTenantID(11), Existing: true}, ) defer restoreConn11.Close() @@ -5994,7 +5994,7 @@ func TestBackupRestoreTenant(t *testing.T) { restoreDB.Exec(t, `RESTORE TENANT 10 FROM 'nodelocal://1/t10' AS OF SYSTEM TIME `+ts1) - restoreConn10 := serverutils.StartTenant( + _, restoreConn10 := serverutils.StartTenant( t, restoreTC.Server(0), base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10), Existing: true}, ) defer restoreConn10.Close() @@ -6012,7 +6012,7 @@ func TestBackupRestoreTenant(t *testing.T) { restoreDB.Exec(t, `RESTORE TENANT 20 FROM 'nodelocal://1/t20'`) - restoreConn20 := serverutils.StartTenant( + _, restoreConn20 := serverutils.StartTenant( t, restoreTC.Server(0), base.TestTenantArgs{TenantID: roachpb.MakeTenantID(20), Existing: true}, ) defer restoreConn20.Close() diff --git a/pkg/ccl/backupccl/show_test.go b/pkg/ccl/backupccl/show_test.go index f80b6d93f8e6..c3d15c077b1b 100644 --- a/pkg/ccl/backupccl/show_test.go +++ b/pkg/ccl/backupccl/show_test.go @@ -418,7 +418,7 @@ func TestShowBackupTenants(t *testing.T) { // NB: tenant certs for 10, 11, 20 are embedded. See: _ = security.EmbeddedTenantIDs() - conn10 := serverutils.StartTenant(t, srv, base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10)}) + _, conn10 := serverutils.StartTenant(t, srv, base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10)}) defer conn10.Close() tenant10 := sqlutils.MakeSQLRunner(conn10) tenant10.Exec(t, `CREATE DATABASE foo; CREATE TABLE foo.bar(i int primary key); INSERT INTO foo.bar VALUES (110), (210)`) diff --git a/pkg/ccl/serverccl/server_sql_test.go b/pkg/ccl/serverccl/server_sql_test.go index ca96a7199f8a..4fd914ea2906 100644 --- a/pkg/ccl/serverccl/server_sql_test.go +++ b/pkg/ccl/serverccl/server_sql_test.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/security" + "github.com/cockroachdb/cockroach/pkg/sql" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" @@ -46,7 +47,7 @@ func TestSQLServer(t *testing.T) { tc := serverutils.StartNewTestCluster(t, 3, base.TestClusterArgs{}) defer tc.Stopper().Stop(ctx) - db := serverutils.StartTenant( + _, db := serverutils.StartTenant( t, tc.Server(0), base.TestTenantArgs{TenantID: roachpb.MakeTenantID(security.EmbeddedTenantIDs()[0])}, @@ -72,7 +73,7 @@ func TestTenantCannotSetClusterSetting(t *testing.T) { defer tc.Stopper().Stop(ctx) // StartTenant with the default permissions to - db := serverutils.StartTenant(t, tc.Server(0), base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10), AllowSettingClusterSettings: false}) + _, db := serverutils.StartTenant(t, tc.Server(0), base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10), AllowSettingClusterSettings: false}) defer db.Close() _, err := db.Exec(`SET CLUSTER SETTING sql.defaults.vectorize=off`) var pqErr *pq.Error @@ -89,10 +90,14 @@ func TestTenantUnauthenticatedAccess(t *testing.T) { tc := serverutils.StartNewTestCluster(t, 1, base.TestClusterArgs{}) defer tc.Stopper().Stop(ctx) - _, _, err := tc.Server(0).StartTenant(base.TestTenantArgs{ + _, err := tc.Server(0).StartTenant(base.TestTenantArgs{ TenantID: roachpb.MakeTenantID(security.EmbeddedTenantIDs()[0]), - // Configure the SQL server to access the wrong tenant keyspace. - TenantIDCodecOverride: roachpb.MakeTenantID(security.EmbeddedTenantIDs()[1]), + TestingKnobs: base.TestingKnobs{ + TenantTestingKnobs: &sql.TenantTestingKnobs{ + // Configure the SQL server to access the wrong tenant keyspace. + TenantIDCodecOverride: roachpb.MakeTenantID(security.EmbeddedTenantIDs()[1]), + }, + }, }) require.Error(t, err) require.Regexp(t, `Unauthenticated desc = requested key /Tenant/11/System/"system-version/" not fully contained in tenant keyspace /Tenant/1{0-1}`, err) @@ -107,12 +112,12 @@ func TestTenantHTTP(t *testing.T) { tc := serverutils.StartNewTestCluster(t, 1, base.TestClusterArgs{}) defer tc.Stopper().Stop(ctx) - _, httpAddr, err := tc.Server(0).StartTenant(base.TestTenantArgs{ + tenant, err := tc.Server(0).StartTenant(base.TestTenantArgs{ TenantID: roachpb.MakeTenantID(security.EmbeddedTenantIDs()[0]), }) require.NoError(t, err) t.Run("prometheus", func(t *testing.T) { - resp, err := httputil.Get(ctx, "http://"+httpAddr+"/_status/vars") + resp, err := httputil.Get(ctx, "http://"+tenant.HTTPAddr()+"/_status/vars") defer http.DefaultClient.CloseIdleConnections() require.NoError(t, err) defer resp.Body.Close() @@ -121,7 +126,7 @@ func TestTenantHTTP(t *testing.T) { require.Contains(t, string(body), "sql_ddl_started_count_internal") }) t.Run("pprof", func(t *testing.T) { - resp, err := httputil.Get(ctx, "http://"+httpAddr+"/debug/pprof/goroutine?debug=2") + resp, err := httputil.Get(ctx, "http://"+tenant.HTTPAddr()+"/debug/pprof/goroutine?debug=2") defer http.DefaultClient.CloseIdleConnections() require.NoError(t, err) defer resp.Body.Close() diff --git a/pkg/cli/mt_start_sql.go b/pkg/cli/mt_start_sql.go index 664c5a292001..469005d8e439 100644 --- a/pkg/cli/mt_start_sql.go +++ b/pkg/cli/mt_start_sql.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/errors" "github.com/spf13/cobra" @@ -97,7 +98,7 @@ func runStartSQL(cmd *cobra.Command, args []string) error { tempStorageMaxSizeBytes, ) - _, addr, httpAddr, err := server.StartTenant( + sqlServer, addr, httpAddr, err := server.StartTenant( ctx, stopper, clusterName, @@ -107,6 +108,14 @@ func runStartSQL(cmd *cobra.Command, args []string) error { if err != nil { return err } + + // Start up the diagnostics reporting loop. + // We don't do this in (*server.SQLServer).start() because we don't + // want this overhead and possible interference in tests. + if !cluster.TelemetryOptOut() { + sqlServer.StartDiagnostics(ctx) + } + log.Infof(ctx, "SQL server for tenant %s listening at %s, http at %s", serverCfg.SQLConfig.TenantID, addr, httpAddr) // TODO(tbg): make the other goodies in `./cockroach start` reusable, such as diff --git a/pkg/server/config.go b/pkg/server/config.go index e417e2af8cb1..532d0956730b 100644 --- a/pkg/server/config.go +++ b/pkg/server/config.go @@ -334,12 +334,6 @@ type SQLConfig struct { // // Only applies when the SQL server is deployed individually. TenantKVAddrs []string - - // TenantIDCodecOverride overrides the tenant ID used to construct the SQL - // server's codec, but nothing else (e.g. its certs). Used for testing. - // - // Only applies when the SQL server is deployed individually. - TenantIDCodecOverride roachpb.TenantID } // MakeSQLConfig returns a SQLConfig with default values. diff --git a/pkg/server/diagnostics/main_test.go b/pkg/server/diagnostics/main_test.go new file mode 100644 index 000000000000..48d8f7ce7cfe --- /dev/null +++ b/pkg/server/diagnostics/main_test.go @@ -0,0 +1,27 @@ +// Copyright 2015 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package diagnostics_test + +import ( + "os" + "testing" + + "github.com/cockroachdb/cockroach/pkg/security" + "github.com/cockroachdb/cockroach/pkg/security/securitytest" + "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" +) + +func TestMain(m *testing.M) { + security.SetAssetLoader(securitytest.EmbeddedAssets) + serverutils.InitTestServerFactory(server.TestServerFactory) + os.Exit(m.Run()) +} diff --git a/pkg/server/diagnostics/reporter.go b/pkg/server/diagnostics/reporter.go index c66744179e84..a3da44d365cd 100644 --- a/pkg/server/diagnostics/reporter.go +++ b/pkg/server/diagnostics/reporter.go @@ -11,9 +11,45 @@ package diagnostics import ( + "bytes" + "context" + "io/ioutil" + "math/rand" + "net/http" + "reflect" + "runtime" "time" + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/build" + "github.com/cockroachdb/cockroach/pkg/config/zonepb" + "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/kv" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/security" + "github.com/cockroachdb/cockroach/pkg/server/diagnosticspb" + "github.com/cockroachdb/cockroach/pkg/server/status" + "github.com/cockroachdb/cockroach/pkg/server/telemetry" "github.com/cockroachdb/cockroach/pkg/settings" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/sql" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" + "github.com/cockroachdb/cockroach/pkg/util/cloudinfo" + "github.com/cockroachdb/cockroach/pkg/util/httputil" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/protoutil" + "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/cockroach/pkg/util/uuid" + "github.com/cockroachdb/errors" + "github.com/gogo/protobuf/proto" + "github.com/mitchellh/reflectwalk" + "github.com/shirou/gopsutil/cpu" + "github.com/shirou/gopsutil/host" + "github.com/shirou/gopsutil/load" + "github.com/shirou/gopsutil/mem" ) // ReportFrequency is the interval at which diagnostics data should be reported. @@ -22,3 +58,382 @@ var ReportFrequency = settings.RegisterPublicNonNegativeDurationSetting( "interval at which diagnostics data should be reported", time.Hour, ) + +const updateCheckJitterSeconds = 120 + +// Reporter is a helper struct that phones home to report usage and diagnostics. +type Reporter struct { + StartTime time.Time + AmbientCtx *log.AmbientContext + Config *base.Config + Settings *cluster.Settings + + // ClusterID is not yet available at the time the reporter is created, so + // instead initialize with a function that gets it dynamically. + ClusterID func() uuid.UUID + TenantID roachpb.TenantID + + // SQLInstanceID is not yet available at the time the reporter is created, + // so instead initialize with a function that gets it dynamically. + SQLInstanceID func() base.SQLInstanceID + SQLServer *sql.Server + InternalExec *sql.InternalExecutor + DB *kv.DB + Recorder *status.MetricsRecorder + + // Locality is a description of the topography of the server. + Locality roachpb.Locality + + // TestingKnobs is used for internal test controls only. + TestingKnobs *diagnosticspb.TestingKnobs +} + +// PeriodicallyReportDiagnostics calls ReportDiagnostics on a regular schedule. +func (r *Reporter) PeriodicallyReportDiagnostics(ctx context.Context, stopper *stop.Stopper) { + stopper.RunWorker(ctx, func(ctx context.Context) { + defer log.RecoverAndReportNonfatalPanic(ctx, &r.Settings.SV) + nextReport := r.StartTime + + var timer timeutil.Timer + defer timer.Stop() + for { + // TODO(dt): we should allow tuning the reset and report intervals separately. + // Consider something like rand.Float() > resetFreq/reportFreq here to sample + // stat reset periods for reporting. + // Always report diagnostics + // 1. In 20.2, this Reporter code only runs for tenants. + // 2. The diagnostics reporting cluster setting is disabled in + // tenant clusters. + // 3. Tenant cluster settings cannot be changed in 20.2. + r.ReportDiagnostics(ctx) + + nextReport = nextReport.Add(ReportFrequency.Get(&r.Settings.SV)) + + timer.Reset(addJitter(nextReport.Sub(timeutil.Now()), updateCheckJitterSeconds)) + select { + case <-stopper.ShouldQuiesce(): + return + case <-timer.C: + timer.Read = true + } + } + }) +} + +// ReportDiagnostics phones home to report usage and diagnostics. +// +// NOTE: This can be slow because of cloud detection; use cloudinfo.Disable() in +// tests to avoid that. +func (r *Reporter) ReportDiagnostics(ctx context.Context) { + ctx, span := r.AmbientCtx.AnnotateCtxWithSpan(ctx, "usageReport") + defer span.Finish() + + report := r.getReportingInfo(ctx, telemetry.ResetCounts) + + clusterInfo := diagnosticspb.ClusterInfo{ + ClusterID: r.ClusterID(), + TenantID: r.TenantID, + IsInsecure: r.Config.Insecure, + IsInternal: sql.ClusterIsInternal(&r.Settings.SV), + } + reportingURL := diagnosticspb.BuildReportingURL(&clusterInfo, report, r.TestingKnobs) + if reportingURL == nil { + return + } + + b, err := protoutil.Marshal(report) + if err != nil { + log.Warningf(ctx, "%v", err) + return + } + + res, err := httputil.Post( + ctx, reportingURL.String(), "application/x-protobuf", bytes.NewReader(b), + ) + if err != nil { + if log.V(2) { + // This is probably going to be relatively common in production + // environments where network access is usually curtailed. + log.Warningf(ctx, "failed to report node usage metrics: %v", err) + } + return + } + defer res.Body.Close() + b, err = ioutil.ReadAll(res.Body) + if err != nil || res.StatusCode != http.StatusOK { + log.Warningf(ctx, "failed to report node usage metrics: status: %s, body: %s, "+ + "error: %v", res.Status, b, err) + return + } + r.SQLServer.ResetReportedStats(ctx) +} + +func (r *Reporter) getReportingInfo( + ctx context.Context, reset telemetry.ResetCounters, +) *diagnosticspb.DiagnosticReport { + info := diagnosticspb.DiagnosticReport{} + secret := sql.ClusterSecret.Get(&r.Settings.SV) + uptime := int64(timeutil.Now().Sub(r.StartTime).Seconds()) + + // Populate the hardware, OS, binary, and location of the CRDB node or SQL + // instance. + r.populateEnvironment(ctx, secret, &info.Env) + + // Always populate SQL info, since even full CRDB running KV will also be + // running SQL. + r.populateSQLInfo(uptime, &info.SQL) + + // Do not collect node or store information for tenant reports. + if r.TenantID == roachpb.SystemTenantID { + r.populateNodeInfo(ctx, uptime, &info) + } + + schema, err := r.collectSchemaInfo(ctx) + if err != nil { + log.Warningf(ctx, "error collecting schema info for diagnostic report: %+v", err) + schema = nil + } + info.Schema = schema + + info.FeatureUsage = telemetry.GetFeatureCounts(telemetry.Quantized, reset) + + // Read the system.settings table to determine the settings for which we have + // explicitly set values -- the in-memory SV has the set and default values + // flattened for quick reads, but we'd rather only report the non-defaults. + if datums, err := r.InternalExec.QueryEx( + ctx, "read-setting", nil, /* txn */ + sessiondata.InternalExecutorOverride{User: security.RootUser}, + "SELECT name FROM system.settings", + ); err != nil { + log.Warningf(ctx, "failed to read settings: %s", err) + } else { + info.AlteredSettings = make(map[string]string, len(datums)) + for _, row := range datums { + name := string(tree.MustBeDString(row[0])) + info.AlteredSettings[name] = settings.RedactedValue(name, &r.Settings.SV) + } + } + + if datums, err := r.InternalExec.QueryEx( + ctx, + "read-zone-configs", + nil, /* txn */ + sessiondata.InternalExecutorOverride{User: security.RootUser}, + "SELECT id, config FROM system.zones", + ); err != nil { + log.Warningf(ctx, "%v", err) + } else { + info.ZoneConfigs = make(map[int64]zonepb.ZoneConfig) + for _, row := range datums { + id := int64(tree.MustBeDInt(row[0])) + var zone zonepb.ZoneConfig + if bytes, ok := row[1].(*tree.DBytes); !ok { + continue + } else { + if err := protoutil.Unmarshal([]byte(*bytes), &zone); err != nil { + log.Warningf(ctx, "unable to parse zone config %d: %v", id, err) + continue + } + } + var anonymizedZone zonepb.ZoneConfig + anonymizeZoneConfig(&anonymizedZone, zone, secret) + info.ZoneConfigs[id] = anonymizedZone + } + } + + info.SqlStats = r.SQLServer.GetScrubbedReportingStats() + return &info +} + +// populateEnvironment fills fields in the given environment, such as the +// hardware, OS, binary, and location of the CRDB node or SQL instance. +func (r *Reporter) populateEnvironment( + ctx context.Context, secret string, env *diagnosticspb.Environment, +) { + env.Build = build.GetInfo() + env.LicenseType = getLicenseType(ctx, r.Settings) + populateHardwareInfo(ctx, env) + + // Add in the localities. + for _, tier := range r.Locality.Tiers { + env.Locality.Tiers = append(env.Locality.Tiers, roachpb.Tier{ + Key: sql.HashForReporting(secret, tier.Key), + Value: sql.HashForReporting(secret, tier.Value), + }) + } +} + +// populateNodeInfo populates the NodeInfo and StoreInfo fields of the +// diagnostics report. +func (r *Reporter) populateNodeInfo( + ctx context.Context, uptime int64, info *diagnosticspb.DiagnosticReport, +) { + n := r.Recorder.GenerateNodeStatus(ctx) + info.Node.NodeID = n.Desc.NodeID + info.Node.Uptime = uptime + + info.Stores = make([]diagnosticspb.StoreInfo, len(n.StoreStatuses)) + for i, r := range n.StoreStatuses { + info.Stores[i].NodeID = r.Desc.Node.NodeID + info.Stores[i].StoreID = r.Desc.StoreID + info.Stores[i].KeyCount = int64(r.Metrics["keycount"]) + info.Stores[i].Capacity = int64(r.Metrics["capacity"]) + info.Stores[i].Available = int64(r.Metrics["capacity.available"]) + info.Stores[i].Used = int64(r.Metrics["capacity.used"]) + info.Node.KeyCount += info.Stores[i].KeyCount + info.Stores[i].RangeCount = int64(r.Metrics["replicas"]) + info.Node.RangeCount += info.Stores[i].RangeCount + bytes := int64(r.Metrics["sysbytes"] + r.Metrics["intentbytes"] + r.Metrics["valbytes"] + r.Metrics["keybytes"]) + info.Stores[i].Bytes = bytes + info.Node.Bytes += bytes + info.Stores[i].EncryptionAlgorithm = int64(r.Metrics["rocksdb.encryption.algorithm"]) + } + + // Fill in all deprecated NodeInfo fields with information that is now in + // other parts of the diagnostics report. + // TODO(andyk): Remove this code once the registration server gets this + // information from the other fields. + info.Node.Locality = info.Env.Locality + info.Node.Hardware = info.Env.Hardware + info.Node.Os = info.Env.Os + info.Node.Build = info.Env.Build + info.Node.LicenseType = info.Env.LicenseType + info.Node.Topology = info.Env.Topology +} + +func (r *Reporter) populateSQLInfo(uptime int64, sql *diagnosticspb.SQLInstanceInfo) { + sql.SQLInstanceID = r.SQLInstanceID() + sql.Uptime = uptime +} + +func (r *Reporter) collectSchemaInfo(ctx context.Context) ([]descpb.TableDescriptor, error) { + startKey := keys.MakeSQLCodec(r.TenantID).TablePrefix(keys.DescriptorTableID) + endKey := startKey.PrefixEnd() + kvs, err := r.DB.Scan(ctx, startKey, endKey, 0) + if err != nil { + return nil, err + } + tables := make([]descpb.TableDescriptor, 0, len(kvs)) + redactor := stringRedactor{} + for _, kv := range kvs { + var desc descpb.Descriptor + if err := kv.ValueProto(&desc); err != nil { + return nil, errors.Wrapf(err, "%s: unable to unmarshal SQL descriptor", kv.Key) + } + if t := descpb.TableFromDescriptor(&desc, kv.Value.Timestamp); t != nil && t.ID > keys.MaxReservedDescID { + if err := reflectwalk.Walk(t, redactor); err != nil { + panic(err) // stringRedactor never returns a non-nil err + } + tables = append(tables, *t) + } + } + return tables, nil +} + +func getLicenseType(ctx context.Context, settings *cluster.Settings) string { + licenseType, err := base.LicenseType(settings) + if err != nil { + log.Errorf(ctx, "error retrieving license type: %s", err) + return "" + } + return licenseType +} + +// populateHardwareInfo populates OS, CPU, memory, etc. information about the +// environment in which CRDB is running. +func populateHardwareInfo(ctx context.Context, e *diagnosticspb.Environment) { + if platform, family, version, err := host.PlatformInformation(); err == nil { + e.Os.Family = family + e.Os.Platform = platform + e.Os.Version = version + } + + if virt, role, err := host.Virtualization(); err == nil && role == "guest" { + e.Hardware.Virtualization = virt + } + + if m, err := mem.VirtualMemory(); err == nil { + e.Hardware.Mem.Available = m.Available + e.Hardware.Mem.Total = m.Total + } + + e.Hardware.Cpu.Numcpu = int32(runtime.NumCPU()) + if cpus, err := cpu.InfoWithContext(ctx); err == nil && len(cpus) > 0 { + e.Hardware.Cpu.Sockets = int32(len(cpus)) + c := cpus[0] + e.Hardware.Cpu.Cores = c.Cores + e.Hardware.Cpu.Model = c.ModelName + e.Hardware.Cpu.Mhz = float32(c.Mhz) + e.Hardware.Cpu.Features = c.Flags + } + + if l, err := load.AvgWithContext(ctx); err == nil { + e.Hardware.Loadavg15 = float32(l.Load15) + } + + e.Hardware.Provider, e.Hardware.InstanceClass = cloudinfo.GetInstanceClass(ctx) + e.Topology.Provider, e.Topology.Region = cloudinfo.GetInstanceRegion(ctx) +} + +func anonymizeZoneConfig(dst *zonepb.ZoneConfig, src zonepb.ZoneConfig, secret string) { + if src.RangeMinBytes != nil { + dst.RangeMinBytes = proto.Int64(*src.RangeMinBytes) + } + if src.RangeMaxBytes != nil { + dst.RangeMaxBytes = proto.Int64(*src.RangeMaxBytes) + } + if src.GC != nil { + dst.GC = &zonepb.GCPolicy{TTLSeconds: src.GC.TTLSeconds} + } + if src.NumReplicas != nil { + dst.NumReplicas = proto.Int32(*src.NumReplicas) + } + dst.Constraints = make([]zonepb.ConstraintsConjunction, len(src.Constraints)) + for i := range src.Constraints { + dst.Constraints[i].NumReplicas = src.Constraints[i].NumReplicas + dst.Constraints[i].Constraints = make([]zonepb.Constraint, len(src.Constraints[i].Constraints)) + for j := range src.Constraints[i].Constraints { + dst.Constraints[i].Constraints[j].Type = src.Constraints[i].Constraints[j].Type + if key := src.Constraints[i].Constraints[j].Key; key != "" { + dst.Constraints[i].Constraints[j].Key = sql.HashForReporting(secret, key) + } + if val := src.Constraints[i].Constraints[j].Value; val != "" { + dst.Constraints[i].Constraints[j].Value = sql.HashForReporting(secret, val) + } + } + } + dst.LeasePreferences = make([]zonepb.LeasePreference, len(src.LeasePreferences)) + for i := range src.LeasePreferences { + dst.LeasePreferences[i].Constraints = make([]zonepb.Constraint, len(src.LeasePreferences[i].Constraints)) + for j := range src.LeasePreferences[i].Constraints { + dst.LeasePreferences[i].Constraints[j].Type = src.LeasePreferences[i].Constraints[j].Type + if key := src.LeasePreferences[i].Constraints[j].Key; key != "" { + dst.LeasePreferences[i].Constraints[j].Key = sql.HashForReporting(secret, key) + } + if val := src.LeasePreferences[i].Constraints[j].Value; val != "" { + dst.LeasePreferences[i].Constraints[j].Value = sql.HashForReporting(secret, val) + } + } + } + dst.Subzones = make([]zonepb.Subzone, len(src.Subzones)) + for i := range src.Subzones { + dst.Subzones[i].IndexID = src.Subzones[i].IndexID + dst.Subzones[i].PartitionName = sql.HashForReporting(secret, src.Subzones[i].PartitionName) + anonymizeZoneConfig(&dst.Subzones[i].Config, src.Subzones[i].Config, secret) + } +} + +type stringRedactor struct{} + +func (stringRedactor) Primitive(v reflect.Value) error { + if v.Kind() == reflect.String && v.String() != "" { + v.Set(reflect.ValueOf("_").Convert(v.Type())) + } + return nil +} + +// randomly shift `d` to be up to `jitterSec` shorter or longer. +func addJitter(d time.Duration, jitterSec int) time.Duration { + j := time.Duration(rand.Intn(jitterSec*2)-jitterSec) * time.Second + return d + j +} diff --git a/pkg/server/diagnostics/reporter_test.go b/pkg/server/diagnostics/reporter_test.go new file mode 100644 index 000000000000..0571808c8c0c --- /dev/null +++ b/pkg/server/diagnostics/reporter_test.go @@ -0,0 +1,420 @@ +// Copyright 2016 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package diagnostics_test + +import ( + "context" + gosql "database/sql" + "fmt" + "runtime" + "testing" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/ccl/kvccl/kvtenantccl" + "github.com/cockroachdb/cockroach/pkg/clusterversion" + "github.com/cockroachdb/cockroach/pkg/config/zonepb" + "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/security" + "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/server/diagnostics" + "github.com/cockroachdb/cockroach/pkg/server/diagnosticspb" + "github.com/cockroachdb/cockroach/pkg/server/telemetry" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/sql" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/lease" + "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/testutils/diagutils" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/util/cloudinfo" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/errors" + "github.com/stretchr/testify/require" +) + +// Dummy import to pull in kvtenantccl. This allows us to start tenants. +var _ = kvtenantccl.Connector{} + +const elemName = "somestring" + +func TestTenantReport(t *testing.T) { + defer leaktest.AfterTest(t) + defer log.Scope(t).Close(t) + + rt := startReporterTest(t) + defer rt.Close() + + tenantArgs := base.TestTenantArgs{ + TenantID: roachpb.MakeTenantID(security.EmbeddedTenantIDs()[0]), + AllowSettingClusterSettings: true, + TestingKnobs: rt.testingKnobs, + } + tenant, tenantDB := serverutils.StartTenant(t, rt.server, tenantArgs) + reporter := tenant.DiagnosticsReporter().(*diagnostics.Reporter) + + ctx := context.Background() + setupCluster(t, tenantDB) + + // Clear the SQL stat pool before getting diagnostics. + rt.server.SQLServer().(*sql.Server).ResetSQLStats(ctx) + reporter.ReportDiagnostics(ctx) + + require.Equal(t, 1, rt.diagServer.NumRequests()) + + last := rt.diagServer.LastRequestData() + require.Equal(t, rt.server.ClusterID().String(), last.UUID) + require.Equal(t, tenantArgs.TenantID.String(), last.TenantID) + require.Equal(t, "", last.NodeID) + require.Equal(t, tenant.SQLInstanceID().String(), last.SQLInstanceID) + require.Equal(t, "true", last.Internal) + + // Verify environment. + verifyEnvironment(t, "", roachpb.Locality{}, &last.Env) + + // Verify SQL info. + require.Equal(t, tenant.SQLInstanceID(), last.SQL.SQLInstanceID) + + // Verify FeatureUsage. + require.NotZero(t, len(last.FeatureUsage)) + + // Call PeriodicallyReportDiagnostics and ensure it sends out a report. + reporter.PeriodicallyReportDiagnostics(ctx, rt.server.Stopper()) + testutils.SucceedsSoon(t, func() error { + if rt.diagServer.NumRequests() != 2 { + return errors.Errorf("did not receive a diagnostics report") + } + return nil + }) +} + +// TestServerReport checks nodes, stores, localities, and zone configs. +// Telemetry metrics are checked in datadriven tests (see sql.TestTelemetry). +func TestServerReport(t *testing.T) { + defer leaktest.AfterTest(t) + defer log.Scope(t).Close(t) + + rt := startReporterTest(t) + defer rt.Close() + + ctx := context.Background() + setupCluster(t, rt.serverDB) + + for _, cmd := range []struct { + resource string + config string + }{ + {"TABLE system.rangelog", fmt.Sprintf(`constraints: [+zone=%[1]s, +%[1]s]`, elemName)}, + {"TABLE system.rangelog", `{gc: {ttlseconds: 1}}`}, + {"DATABASE system", `num_replicas: 5`}, + {"DATABASE system", fmt.Sprintf(`constraints: {"+zone=%[1]s,+%[1]s": 2, +%[1]s: 1}`, elemName)}, + {"DATABASE system", fmt.Sprintf(`experimental_lease_preferences: [[+zone=%[1]s,+%[1]s], [+%[1]s]]`, elemName)}, + } { + testutils.SucceedsSoon(t, func() error { + if _, err := rt.serverDB.Exec( + fmt.Sprintf(`ALTER %s CONFIGURE ZONE = '%s'`, cmd.resource, cmd.config), + ); err != nil { + // Work around gossip asynchronicity. + return errors.Errorf("error applying zone config %q to %q: %v", cmd.config, cmd.resource, err) + } + return nil + }) + } + + expectedUsageReports := 0 + + clusterSecret := sql.ClusterSecret.Get(&rt.settings.SV) + testutils.SucceedsSoon(t, func() error { + expectedUsageReports++ + + node := rt.server.MetricsRecorder().GenerateNodeStatus(ctx) + // Clear the SQL stat pool before getting diagnostics. + rt.server.SQLServer().(*sql.Server).ResetSQLStats(ctx) + rt.server.ReportDiagnostics(ctx) + + keyCounts := make(map[roachpb.StoreID]int64) + rangeCounts := make(map[roachpb.StoreID]int64) + totalKeys := int64(0) + totalRanges := int64(0) + + for _, store := range node.StoreStatuses { + keys, ok := store.Metrics["keycount"] + require.True(t, ok, "keycount not in metrics") + totalKeys += int64(keys) + keyCounts[store.Desc.StoreID] = int64(keys) + + replicas, ok := store.Metrics["replicas"] + require.True(t, ok, "replicas not in metrics") + totalRanges += int64(replicas) + rangeCounts[store.Desc.StoreID] = int64(replicas) + } + + require.Equal(t, expectedUsageReports, rt.diagServer.NumRequests()) + + last := rt.diagServer.LastRequestData() + if expected, actual := rt.server.ClusterID().String(), last.UUID; expected != actual { + return errors.Errorf("expected cluster id %v got %v", expected, actual) + } + if expected, actual := "system", last.TenantID; expected != actual { + return errors.Errorf("expected tenant id %v got %v", expected, actual) + } + if expected, actual := rt.server.NodeID().String(), last.NodeID; expected != actual { + return errors.Errorf("expected node id %v got %v", expected, actual) + } + if expected, actual := rt.server.NodeID().String(), last.SQLInstanceID; expected != actual { + return errors.Errorf("expected sql instance id %v got %v", expected, actual) + } + if expected, actual := rt.server.NodeID(), last.Node.NodeID; expected != actual { + return errors.Errorf("expected node id %v got %v", expected, actual) + } + + if last.Node.Hardware.Mem.Total == 0 { + return errors.Errorf("expected non-zero total mem") + } + if last.Node.Hardware.Mem.Available == 0 { + return errors.Errorf("expected non-zero available mem") + } + if actual, expected := last.Node.Hardware.Cpu.Numcpu, runtime.NumCPU(); int(actual) != expected { + return errors.Errorf("expected %d num cpu, got %d", expected, actual) + } + if last.Node.Hardware.Cpu.Sockets == 0 { + return errors.Errorf("expected non-zero sockets") + } + if last.Node.Hardware.Cpu.Mhz == 0.0 { + return errors.Errorf("expected non-zero speed") + } + if last.Node.Os.Platform == "" { + return errors.Errorf("expected non-empty OS") + } + + if minExpected, actual := totalKeys, last.Node.KeyCount; minExpected > actual { + return errors.Errorf("expected node keys at least %v got %v", minExpected, actual) + } + if minExpected, actual := totalRanges, last.Node.RangeCount; minExpected > actual { + return errors.Errorf("expected node ranges at least %v got %v", minExpected, actual) + } + if minExpected, actual := len(rt.serverArgs.StoreSpecs), len(last.Stores); minExpected > actual { + return errors.Errorf("expected at least %v stores got %v", minExpected, actual) + } + if expected, actual := "true", last.Internal; expected != actual { + return errors.Errorf("expected internal to be %v, got %v", expected, actual) + } + if expected, actual := len(rt.serverArgs.Locality.Tiers), len(last.Node.Locality.Tiers); expected != actual { + return errors.Errorf("expected locality to have %d tier, got %d", expected, actual) + } + for i := range rt.serverArgs.Locality.Tiers { + if expected, actual := sql.HashForReporting(clusterSecret, rt.serverArgs.Locality.Tiers[i].Key), + last.Node.Locality.Tiers[i].Key; expected != actual { + return errors.Errorf("expected locality tier %d key to be %s, got %s", i, expected, actual) + } + if expected, actual := sql.HashForReporting(clusterSecret, rt.serverArgs.Locality.Tiers[i].Value), + last.Node.Locality.Tiers[i].Value; expected != actual { + return errors.Errorf("expected locality tier %d value to be %s, got %s", i, expected, actual) + } + } + + for _, store := range last.Stores { + if minExpected, actual := keyCounts[store.StoreID], store.KeyCount; minExpected > actual { + return errors.Errorf("expected at least %v keys in store %v got %v", minExpected, store.StoreID, actual) + } + if minExpected, actual := rangeCounts[store.StoreID], store.RangeCount; minExpected > actual { + return errors.Errorf("expected at least %v ranges in store %v got %v", minExpected, store.StoreID, actual) + } + } + return nil + }) + + last := rt.diagServer.LastRequestData() + // This check isn't clean, since the body is a raw proto binary and thus could + // easily contain some encoded form of elemName, but *if* it ever does fail, + // that is probably very interesting. + require.NotContains(t, last.RawReportBody, elemName) + + // 3 + 3 = 6: set 3 initially and org is set mid-test for 3 altered settings, + // plus version, reporting and secret settings are set in startup + // migrations. + expected, actual := 6, len(last.AlteredSettings) + require.Equal(t, expected, actual, "expected %d changed settings, got %d: %v", expected, actual, last.AlteredSettings) + + for key, expected := range map[string]string{ + "cluster.organization": "", + "diagnostics.reporting.send_crash_reports": "false", + "server.time_until_store_dead": "1m30s", + "version": clusterversion.TestingBinaryVersion.String(), + "cluster.secret": "", + } { + got, ok := last.AlteredSettings[key] + require.True(t, ok, "expected report of altered setting %q", key) + require.Equal(t, expected, got, "expected reported value of setting %q to be %q not %q", key, expected, got) + } + + // Verify that we receive the four auto-populated zone configs plus the two + // modified above, and that their values are as expected. + for _, expectedID := range []int64{ + keys.RootNamespaceID, + keys.LivenessRangesID, + keys.MetaRangesID, + keys.RangeEventTableID, + keys.SystemDatabaseID, + } { + _, ok := last.ZoneConfigs[expectedID] + require.True(t, ok, "didn't find expected ID %d in reported ZoneConfigs: %+v", + expectedID, last.ZoneConfigs) + } + hashedElemName := sql.HashForReporting(clusterSecret, elemName) + hashedZone := sql.HashForReporting(clusterSecret, "zone") + for id, zone := range last.ZoneConfigs { + if id == keys.RootNamespaceID { + require.Equal(t, zone, *rt.server.ExecutorConfig().(sql.ExecutorConfig).DefaultZoneConfig) + } + if id == keys.RangeEventTableID { + require.Equal(t, int32(1), zone.GC.TTLSeconds) + constraints := []zonepb.ConstraintsConjunction{ + { + Constraints: []zonepb.Constraint{ + {Key: hashedZone, Value: hashedElemName, Type: zonepb.Constraint_REQUIRED}, + {Value: hashedElemName, Type: zonepb.Constraint_REQUIRED}, + }, + }, + } + require.Equal(t, zone.Constraints, constraints) + } + if id == keys.SystemDatabaseID { + constraints := []zonepb.ConstraintsConjunction{ + { + NumReplicas: 1, + Constraints: []zonepb.Constraint{{Value: hashedElemName, Type: zonepb.Constraint_REQUIRED}}, + }, + { + NumReplicas: 2, + Constraints: []zonepb.Constraint{ + {Key: hashedZone, Value: hashedElemName, Type: zonepb.Constraint_REQUIRED}, + {Value: hashedElemName, Type: zonepb.Constraint_REQUIRED}, + }, + }, + } + require.Equal(t, constraints, zone.Constraints) + prefs := []zonepb.LeasePreference{ + { + Constraints: []zonepb.Constraint{ + {Key: hashedZone, Value: hashedElemName, Type: zonepb.Constraint_REQUIRED}, + {Value: hashedElemName, Type: zonepb.Constraint_REQUIRED}, + }, + }, + { + Constraints: []zonepb.Constraint{{Value: hashedElemName, Type: zonepb.Constraint_REQUIRED}}, + }, + } + require.Equal(t, prefs, zone.LeasePreferences) + } + } +} + +type reporterTest struct { + cloudEnable func() + settings *cluster.Settings + diagServer *diagutils.Server + testingKnobs base.TestingKnobs + serverArgs base.TestServerArgs + server serverutils.TestServerInterface + serverDB *gosql.DB +} + +func (t *reporterTest) Close() { + t.cloudEnable() + t.diagServer.Close() + // stopper will wait for the update/report loop to finish too. + t.server.Stopper().Stop(context.Background()) +} + +func startReporterTest(t *testing.T) *reporterTest { + // Disable cloud info reporting, since it slows down tests. + rt := &reporterTest{ + cloudEnable: cloudinfo.Disable(), + settings: cluster.MakeTestingClusterSettings(), + diagServer: diagutils.NewServer(), + } + + url := rt.diagServer.URL() + rt.testingKnobs = base.TestingKnobs{ + SQLLeaseManager: &lease.ManagerTestingKnobs{ + // Disable SELECT called for delete orphaned leases to keep + // query stats stable. + DisableDeleteOrphanedLeases: true, + }, + Server: &server.TestingKnobs{ + DiagnosticsTestingKnobs: diagnosticspb.TestingKnobs{ + OverrideReportingURL: &url, + }, + }, + } + + storeSpec := base.DefaultTestStoreSpec + storeSpec.Attributes = roachpb.Attributes{Attrs: []string{elemName}} + rt.serverArgs = base.TestServerArgs{ + StoreSpecs: []base.StoreSpec{ + storeSpec, + base.DefaultTestStoreSpec, + }, + Settings: rt.settings, + Locality: roachpb.Locality{ + Tiers: []roachpb.Tier{ + {Key: "region", Value: "east"}, + {Key: "zone", Value: elemName}, + {Key: "state", Value: "ny"}, + {Key: "city", Value: "nyc"}, + }, + }, + Knobs: rt.testingKnobs, + } + rt.server, rt.serverDB, _ = serverutils.StartServer(t, rt.serverArgs) + + // Make sure the test's generated activity is the only activity we measure. + telemetry.GetFeatureCounts(telemetry.Raw, telemetry.ResetCounts) + + return rt +} + +func setupCluster(t *testing.T, db *gosql.DB) { + _, err := db.Exec(`SET CLUSTER SETTING server.time_until_store_dead = '90s'`) + require.NoError(t, err) + + _, err = db.Exec(`SET CLUSTER SETTING diagnostics.reporting.send_crash_reports = false`) + require.NoError(t, err) + + _, err = db.Exec(fmt.Sprintf(`CREATE DATABASE %s`, elemName)) + require.NoError(t, err) + + // Set cluster to an internal testing cluster + q := `SET CLUSTER SETTING cluster.organization = 'Cockroach Labs - Production Testing'` + _, err = db.Exec(q) + require.NoError(t, err) +} + +func verifyEnvironment( + t *testing.T, secret string, locality roachpb.Locality, env *diagnosticspb.Environment, +) { + require.NotEqual(t, 0, env.Hardware.Mem.Total) + require.NotEqual(t, 0, env.Hardware.Mem.Available) + require.Equal(t, int32(runtime.NumCPU()), env.Hardware.Cpu.Numcpu) + require.NotEqual(t, 0, env.Hardware.Cpu.Sockets) + require.NotEqual(t, 0.0, env.Hardware.Cpu.Mhz) + require.NotEqual(t, 0.0, env.Os.Platform) + require.NotEmpty(t, env.Build.Tag) + require.NotEmpty(t, env.Build.Distribution) + require.NotEmpty(t, env.LicenseType) + + require.Equal(t, len(locality.Tiers), len(env.Locality.Tiers)) + for i := range locality.Tiers { + require.Equal(t, sql.HashForReporting(secret, locality.Tiers[i].Key), env.Locality.Tiers[i].Key) + require.Equal(t, sql.HashForReporting(secret, locality.Tiers[i].Value), env.Locality.Tiers[i].Value) + } +} diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index c4c1ce555112..f21d927542ff 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -36,6 +36,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/rpc" "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer" "github.com/cockroachdb/cockroach/pkg/scheduledjobs" + "github.com/cockroachdb/cockroach/pkg/server/diagnostics" "github.com/cockroachdb/cockroach/pkg/server/serverpb" "github.com/cockroachdb/cockroach/pkg/server/status" "github.com/cockroachdb/cockroach/pkg/sql" @@ -66,6 +67,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/mon" "github.com/cockroachdb/cockroach/pkg/util/netutil" "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" "github.com/marusama/semaphore" "google.golang.org/grpc" @@ -99,6 +101,7 @@ type SQLServer struct { stmtDiagnosticsRegistry *stmtdiagnostics.Registry sqlLivenessProvider sqlliveness.Provider metricsRegistry *metric.Registry + diagnosticsReporter *diagnostics.Reporter } // sqlServerOptionalKVArgs are the arguments supplied to newSQLServer which are @@ -210,8 +213,11 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { } execCfg := &sql.ExecutorConfig{} codec := keys.MakeSQLCodec(cfg.SQLConfig.TenantID) - if override := cfg.SQLConfig.TenantIDCodecOverride; override != (roachpb.TenantID{}) { - codec = keys.MakeSQLCodec(override) + if knobs := cfg.TestingKnobs.TenantTestingKnobs; knobs != nil { + override := knobs.(*sql.TenantTestingKnobs).TenantIDCodecOverride + if override != (roachpb.TenantID{}) { + codec = keys.MakeSQLCodec(override) + } } // Create blob service for inter-node file sharing. @@ -636,6 +642,27 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { sqlExecutorTestingKnobs, ) + var reporter *diagnostics.Reporter + if cfg.tenantConnect != nil { + reporter = &diagnostics.Reporter{ + StartTime: timeutil.Now(), + AmbientCtx: &cfg.AmbientCtx, + Config: cfg.BaseConfig.Config, + Settings: cfg.Settings, + ClusterID: cfg.rpcContext.ClusterID.Get, + TenantID: cfg.rpcContext.TenantID, + SQLInstanceID: cfg.nodeIDContainer.SQLInstanceID, + SQLServer: pgServer.SQLServer, + InternalExec: cfg.circularInternalExecutor, + DB: cfg.db, + Recorder: cfg.recorder, + Locality: cfg.Locality, + } + if cfg.TestingKnobs.Server != nil { + reporter.TestingKnobs = &cfg.TestingKnobs.Server.(*TestingKnobs).DiagnosticsTestingKnobs + } + } + return &SQLServer{ stopper: cfg.stopper, sqlIDContainer: cfg.nodeIDContainer, @@ -655,6 +682,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { stmtDiagnosticsRegistry: stmtDiagnosticsRegistry, sqlLivenessProvider: cfg.sqlLivenessProvider, metricsRegistry: cfg.registry, + diagnosticsReporter: reporter, }, nil } @@ -806,3 +834,10 @@ func (s *SQLServer) start( func (s *SQLServer) SQLInstanceID() base.SQLInstanceID { return s.sqlIDContainer.SQLInstanceID() } + +// StartDiagnostics starts periodic diagnostics reporting. +// NOTE: This is not called in start so that it's disabled by default for +// testing. +func (s *SQLServer) StartDiagnostics(ctx context.Context) { + s.diagnosticsReporter.PeriodicallyReportDiagnostics(ctx, s.stopper) +} diff --git a/pkg/server/testserver.go b/pkg/server/testserver.go index ebedc2cc7043..ab726a76ee37 100644 --- a/pkg/server/testserver.go +++ b/pkg/server/testserver.go @@ -53,6 +53,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sqlmigrations" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/cloud" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/ts" "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/hlc" @@ -567,25 +568,56 @@ func makeSQLServerArgs( }, nil } +// TestTenant is an in-memory instantiation of the SQL-only process created for +// each active Cockroach tenant. TestTenant provides tests with access to +// internal methods and state on SQLServer. It is typically started in tests by +// calling the TestServerInterface.StartTenant method or by calling the wrapper +// serverutils.StartTenant method. +type TestTenant struct { + *SQLServer + sqlAddr string + httpAddr string +} + +// SQLAddr is part of the TestTenantInterface interface. +func (t *TestTenant) SQLAddr() string { + return t.sqlAddr +} + +// HTTPAddr is part of the TestTenantInterface interface. +func (t *TestTenant) HTTPAddr() string { + return t.httpAddr +} + +// PGServer is part of the TestTenantInterface interface. +func (t *TestTenant) PGServer() interface{} { + return t.pgServer +} + +// DiagnosticsReporter is part of the TestTenantInterface interface. +func (t *TestTenant) DiagnosticsReporter() interface{} { + return t.diagnosticsReporter +} + // StartTenant starts a SQL tenant communicating with this TestServer. func (ts *TestServer) StartTenant( params base.TestTenantArgs, -) (pgAddr string, httpAddr string, err error) { +) (serverutils.TestTenantInterface, error) { ctx := context.Background() if !params.Existing { if _, err := ts.InternalExecutor().(*sql.InternalExecutor).Exec( ctx, "testserver-create-tenant", nil /* txn */, "SELECT crdb_internal.create_tenant($1)", params.TenantID.ToUint64(), ); err != nil { - return "", "", err + return nil, err } } st := cluster.MakeTestingClusterSettings() sqlCfg := makeTestSQLConfig(st, params.TenantID) sqlCfg.TenantKVAddrs = []string{ts.ServingRPCAddr()} - sqlCfg.TenantIDCodecOverride = params.TenantIDCodecOverride baseCfg := makeTestBaseConfig(st) + baseCfg.TestingKnobs = params.TestingKnobs if params.AllowSettingClusterSettings { baseCfg.TestingKnobs.TenantTestingKnobs = &sql.TenantTestingKnobs{ ClusterSettingsUpdater: st.MakeUpdater(), @@ -595,14 +627,14 @@ func (ts *TestServer) StartTenant( if stopper == nil { stopper = ts.Stopper() } - _, pgAddr, httpAddr, err = StartTenant( + sqlServer, addr, httpAddr, err := StartTenant( ctx, stopper, ts.Cfg.ClusterName, baseCfg, sqlCfg, ) - return pgAddr, httpAddr, err + return &TestTenant{SQLServer: sqlServer, sqlAddr: addr, httpAddr: httpAddr}, err } // StartTenant starts a stand-alone SQL server against a KV backend. @@ -1243,6 +1275,11 @@ func (ts *TestServer) ScratchRange() (roachpb.Key, error) { return scratchKey, nil } +// MetricsRecorder periodically records node-level and store-level metrics. +func (ts *TestServer) MetricsRecorder() *status.MetricsRecorder { + return ts.node.recorder +} + type testServerFactoryImpl struct{} // TestServerFactory can be passed to serverutils.InitTestServerFactory diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index 53c2851dfe1c..8d866fd4dd9c 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -870,6 +870,10 @@ type TenantTestingKnobs struct { // in-memory cluster settings. SQL tenants are otherwise prohibited from // setting cluster settings. ClusterSettingsUpdater settings.Updater + + // TenantIDCodecOverride overrides the tenant ID used to construct the SQL + // server's codec, but nothing else (e.g. its certs). Used for testing. + TenantIDCodecOverride roachpb.TenantID } var _ base.ModuleTestingKnobs = &TenantTestingKnobs{} diff --git a/pkg/sql/logictest/logic.go b/pkg/sql/logictest/logic.go index a9a18e07072e..de28f2084ac6 100644 --- a/pkg/sql/logictest/logic.go +++ b/pkg/sql/logictest/logic.go @@ -1331,10 +1331,15 @@ func (t *logicTest) setup(cfg testClusterConfig, serverArgs TestServerArgs) { connsForClusterSettingChanges := []*gosql.DB{t.cluster.ServerConn(0)} if cfg.useTenant { var err error - t.tenantAddr, _, err = t.cluster.Server(t.nodeIdx).StartTenant(base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10), AllowSettingClusterSettings: true}) + tenantArgs := base.TestTenantArgs{ + TenantID: roachpb.MakeTenantID(10), + AllowSettingClusterSettings: true, + } + tenant, err := t.cluster.Server(t.nodeIdx).StartTenant(tenantArgs) if err != nil { t.rootT.Fatalf("%+v", err) } + t.tenantAddr = tenant.SQLAddr() // Open a connection to this tenant to set any cluster settings specified // by the test config. diff --git a/pkg/sql/run_control_test.go b/pkg/sql/run_control_test.go index 14b8e037a881..b8202b62bf79 100644 --- a/pkg/sql/run_control_test.go +++ b/pkg/sql/run_control_test.go @@ -59,7 +59,7 @@ func makeRunControlTestCases(t *testing.T) ([]runControlTestCase, func()) { testCases[0].conn1 = tc.ServerConn(0).Conn testCases[0].conn2 = tc.ServerConn(1).Conn - tenantDB := serverutils.StartTenant(t, tc.Server(0), base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10)}) + _, tenantDB := serverutils.StartTenant(t, tc.Server(0), base.TestTenantArgs{TenantID: roachpb.MakeTenantID(10)}) testCases[1].name = "Tenant" testCases[1].conn1 = tenantDB.Conn testCases[1].conn2 = tenantDB.Conn diff --git a/pkg/sql/telemetry_test.go b/pkg/sql/telemetry_test.go index 13de95776af7..57e91d73349c 100644 --- a/pkg/sql/telemetry_test.go +++ b/pkg/sql/telemetry_test.go @@ -13,6 +13,7 @@ package sql_test import ( "bytes" "context" + gosql "database/sql" "fmt" "regexp" "sort" @@ -22,7 +23,9 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/security" "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/server/diagnostics" "github.com/cockroachdb/cockroach/pkg/server/diagnosticspb" "github.com/cockroachdb/cockroach/pkg/sql" "github.com/cockroachdb/cockroach/pkg/sql/catalog/catconstants" @@ -75,125 +78,185 @@ func TestTelemetry(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - ctx := context.Background() // Note: these tests cannot be run in parallel (with each other or with other // tests) because telemetry counters are global. datadriven.Walk(t, "testdata/telemetry", func(t *testing.T, path string) { // Disable cloud info reporting (it would make these tests really slow). defer cloudinfo.Disable()() - diagSrv := diagutils.NewServer() - defer diagSrv.Close() + var test telemetryTest + test.Start(t) + defer test.Close() + + // Run test against physical CRDB cluster. + t.Run("server", func(t *testing.T) { + datadriven.RunTest(t, path, func(t *testing.T, td *datadriven.TestData) string { + sqlServer := test.server.SQLServer().(*sql.Server) + return test.RunTest(td, test.serverDB, test.server.ReportDiagnostics, sqlServer) + }) + }) - diagSrvURL := diagSrv.URL() - params := base.TestServerArgs{ - Knobs: base.TestingKnobs{ - Server: &server.TestingKnobs{ - DiagnosticsTestingKnobs: diagnosticspb.TestingKnobs{ - OverrideReportingURL: &diagSrvURL, - }, + // Run test against logical tenant cluster. + t.Run("tenant", func(t *testing.T) { + // TODO(andyk): Re-enable these tests once tenant clusters fully + // support the features they're using. + switch path { + case "testdata/telemetry/execution", + "testdata/telemetry/planning", + "testdata/telemetry/sql-stats": + t.Skip("tenant clusters do not support SQL features used by this test") + } + + datadriven.RunTest(t, path, func(t *testing.T, td *datadriven.TestData) string { + sqlServer := test.server.SQLServer().(*sql.Server) + reporter := test.tenant.DiagnosticsReporter().(*diagnostics.Reporter) + return test.RunTest(td, test.tenantDB, reporter.ReportDiagnostics, sqlServer) + }) + }) + }) +} + +type telemetryTest struct { + t *testing.T + diagSrv *diagutils.Server + server serverutils.TestServerInterface + serverDB *gosql.DB + tenant serverutils.TestTenantInterface + tenantDB *gosql.DB + allowlist featureAllowlist +} + +func (tt *telemetryTest) Start(t *testing.T) { + tt.t = t + tt.diagSrv = diagutils.NewServer() + + diagSrvURL := tt.diagSrv.URL() + params := base.TestServerArgs{ + Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + DiagnosticsTestingKnobs: diagnosticspb.TestingKnobs{ + OverrideReportingURL: &diagSrvURL, }, }, - } - s, sqlConn, _ := serverutils.StartServer(t, params) - defer s.Stopper().Stop(ctx) + }, + } + tt.server, tt.serverDB, _ = serverutils.StartServer(tt.t, params) + tt.prepareCluster(tt.serverDB) - runner := sqlutils.MakeSQLRunner(sqlConn) - // Disable automatic reporting so it doesn't interfere with the test. - runner.Exec(t, "SET CLUSTER SETTING diagnostics.reporting.enabled = false") - runner.Exec(t, "SET CLUSTER SETTING diagnostics.reporting.send_crash_reports = false") - // Disable plan caching to get accurate counts if the same statement is - // issued multiple times. - runner.Exec(t, "SET CLUSTER SETTING sql.query_cache.enabled = false") + tt.tenant, tt.tenantDB = serverutils.StartTenant(tt.t, tt.server, base.TestTenantArgs{ + TenantID: roachpb.MakeTenantID(security.EmbeddedTenantIDs()[0]), + AllowSettingClusterSettings: true, + TestingKnobs: params.Knobs, + }) + tt.prepareCluster(tt.tenantDB) +} - var allowlist featureAllowlist - datadriven.RunTest(t, path, func(t *testing.T, td *datadriven.TestData) string { - switch td.Cmd { - case "exec": - _, err := sqlConn.Exec(td.Input) - if err != nil { - if errors.HasAssertionFailure(err) { - td.Fatalf(t, "%+v", err) - } - return fmt.Sprintf("error: %v\n", err) - } - return "" +func (tt *telemetryTest) Close() { + tt.server.Stopper().Stop(context.Background()) + tt.diagSrv.Close() +} - case "schema": - s.ReportDiagnostics(ctx) - last := diagSrv.LastRequestData() - var buf bytes.Buffer - for i := range last.Schema { - buf.WriteString(formatTableDescriptor(&last.Schema[i])) - } - return buf.String() +func (tt *telemetryTest) RunTest( + td *datadriven.TestData, + db *gosql.DB, + reportDiags func(ctx context.Context), + sqlServer *sql.Server, +) string { + ctx := context.Background() + switch td.Cmd { + case "exec": + _, err := db.Exec(td.Input) + if err != nil { + if errors.HasAssertionFailure(err) { + td.Fatalf(tt.t, "%+v", err) + } + return fmt.Sprintf("error: %v\n", err) + } + return "" - case "feature-allowlist": - var err error - allowlist, err = makeAllowlist(strings.Split(td.Input, "\n")) - if err != nil { - td.Fatalf(t, "error parsing feature regex: %s", err) - } - return "" + case "schema": + reportDiags(ctx) + last := tt.diagSrv.LastRequestData() + var buf bytes.Buffer + for i := range last.Schema { + buf.WriteString(formatTableDescriptor(&last.Schema[i])) + } + return buf.String() - case "feature-usage", "feature-counters": - // Report diagnostics once to reset the counters. - s.ReportDiagnostics(ctx) - _, err := sqlConn.Exec(td.Input) - var buf bytes.Buffer - if err != nil { - fmt.Fprintf(&buf, "error: %v\n", err) - } - s.ReportDiagnostics(ctx) - last := diagSrv.LastRequestData() - usage := last.FeatureUsage - keys := make([]string, 0, len(usage)) - for k, v := range usage { - if v == 0 { - // Ignore zero values (shouldn't happen in practice) - continue - } - if !allowlist.Match(k) { - // Feature key not in allowlist. - continue - } - keys = append(keys, k) - } - sort.Strings(keys) - tw := tabwriter.NewWriter(&buf, 2, 1, 2, ' ', 0) - for _, k := range keys { - // Report either just the key or the key and the count. - if td.Cmd == "feature-counters" { - fmt.Fprintf(tw, "%s\t%d\n", k, usage[k]) - } else { - fmt.Fprintf(tw, "%s\n", k) - } - } - _ = tw.Flush() - return buf.String() + case "feature-allowlist": + var err error + tt.allowlist, err = makeAllowlist(strings.Split(td.Input, "\n")) + if err != nil { + td.Fatalf(tt.t, "error parsing feature regex: %s", err) + } + return "" + + case "feature-usage", "feature-counters": + // Report diagnostics once to reset the counters. + reportDiags(ctx) + _, err := db.Exec(td.Input) + var buf bytes.Buffer + if err != nil { + fmt.Fprintf(&buf, "error: %v\n", err) + } + reportDiags(ctx) + last := tt.diagSrv.LastRequestData() + usage := last.FeatureUsage + keys := make([]string, 0, len(usage)) + for k, v := range usage { + if v == 0 { + // Ignore zero values (shouldn't happen in practice) + continue + } + if !tt.allowlist.Match(k) { + // Feature key not in allowlist. + continue + } + keys = append(keys, k) + } + sort.Strings(keys) + tw := tabwriter.NewWriter(&buf, 2, 1, 2, ' ', 0) + for _, k := range keys { + // Report either just the key or the key and the count. + if td.Cmd == "feature-counters" { + fmt.Fprintf(tw, "%s\t%d\n", k, usage[k]) + } else { + fmt.Fprintf(tw, "%s\n", k) + } + } + _ = tw.Flush() + return buf.String() - case "sql-stats": - // Report diagnostics once to reset the stats. - s.SQLServer().(*sql.Server).ResetSQLStats(ctx) - s.ReportDiagnostics(ctx) + case "sql-stats": + // Report diagnostics once to reset the stats. + sqlServer.ResetSQLStats(ctx) + reportDiags(ctx) - _, err := sqlConn.Exec(td.Input) - var buf bytes.Buffer - if err != nil { - fmt.Fprintf(&buf, "error: %v\n", err) - } - s.SQLServer().(*sql.Server).ResetSQLStats(ctx) - s.ReportDiagnostics(ctx) - last := diagSrv.LastRequestData() - buf.WriteString(formatSQLStats(last.SqlStats)) - return buf.String() + _, err := db.Exec(td.Input) + var buf bytes.Buffer + if err != nil { + fmt.Fprintf(&buf, "error: %v\n", err) + } + sqlServer.ResetSQLStats(ctx) + reportDiags(ctx) + last := tt.diagSrv.LastRequestData() + buf.WriteString(formatSQLStats(last.SqlStats)) + return buf.String() - default: - td.Fatalf(t, "unknown command %s", td.Cmd) - return "" - } - }) - }) + default: + td.Fatalf(tt.t, "unknown command %s", td.Cmd) + return "" + } +} + +func (tt *telemetryTest) prepareCluster(db *gosql.DB) { + runner := sqlutils.MakeSQLRunner(db) + // Disable automatic reporting so it doesn't interfere with the test. + runner.Exec(tt.t, "SET CLUSTER SETTING diagnostics.reporting.enabled = false") + runner.Exec(tt.t, "SET CLUSTER SETTING diagnostics.reporting.send_crash_reports = false") + // Disable plan caching to get accurate counts if the same statement is + // issued multiple times. + runner.Exec(tt.t, "SET CLUSTER SETTING sql.query_cache.enabled = false") } type featureAllowlist []*regexp.Regexp diff --git a/pkg/testutils/diagutils/diag_test_server.go b/pkg/testutils/diagutils/diag_test_server.go index 50ea01380a54..5445b148b196 100644 --- a/pkg/testutils/diagutils/diag_test_server.go +++ b/pkg/testutils/diagutils/diag_test_server.go @@ -38,6 +38,9 @@ type Server struct { // RequestData stores the data provided by a diagnostics request. type RequestData struct { UUID string + TenantID string + NodeID string + SQLInstanceID string Version string LicenseType string Internal string @@ -64,6 +67,9 @@ func NewServer() *Server { data := &RequestData{ UUID: r.URL.Query().Get("uuid"), + TenantID: r.URL.Query().Get("tenantid"), + NodeID: r.URL.Query().Get("nodeid"), + SQLInstanceID: r.URL.Query().Get("sqlid"), Version: r.URL.Query().Get("version"), LicenseType: r.URL.Query().Get("licensetype"), Internal: r.URL.Query().Get("internal"), diff --git a/pkg/testutils/serverutils/test_server_shim.go b/pkg/testutils/serverutils/test_server_shim.go index edc6f4728f1a..e281d9660283 100644 --- a/pkg/testutils/serverutils/test_server_shim.go +++ b/pkg/testutils/serverutils/test_server_shim.go @@ -29,12 +29,14 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/rpc" "github.com/cockroachdb/cockroach/pkg/security" + "github.com/cockroachdb/cockroach/pkg/server/status" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/httputil" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/cockroach/pkg/util/uuid" ) // TestServerInterface defines test server functionality that tests need; it is @@ -50,6 +52,10 @@ type TestServerInterface interface { // NodeID returns the ID of this node within its cluster. NodeID() roachpb.NodeID + // ClusterID returns the cluster ID as understood by this node in the + // cluster. + ClusterID() uuid.UUID + // ServingRPCAddr returns the server's advertised address. ServingRPCAddr() string @@ -203,7 +209,7 @@ type TestServerInterface interface { ReportDiagnostics(ctx context.Context) // StartTenant spawns off tenant process connecting to this TestServer. - StartTenant(params base.TestTenantArgs) (pgAddr string, httpAddr string, _ error) + StartTenant(params base.TestTenantArgs) (TestTenantInterface, error) // ScratchRange splits off a range suitable to be used as KV scratch space. // (it doesn't overlap system spans or SQL tables). @@ -211,6 +217,9 @@ type TestServerInterface interface { // Calling this multiple times is undefined (but see // TestCluster.ScratchRange() which is idempotent). ScratchRange() (roachpb.Key, error) + + // MetricsRecorder periodically records node-level and store-level metrics. + MetricsRecorder() *status.MetricsRecorder } // TestServerFactory encompasses the actual implementation of the shim @@ -239,7 +248,8 @@ func StartServer( if err := server.Start(); err != nil { t.Fatalf("%+v", err) } - goDB := OpenDBConn(t, server, params, server.Stopper()) + goDB := OpenDBConn( + t, server.ServingSQLAddr(), params.UseDatabase, params.Insecure, server.Stopper()) return server, goDB, server.DB() } @@ -255,12 +265,12 @@ func NewServer(params base.TestServerArgs) TestServerInterface { // OpenDBConn sets up a gosql DB connection to the given server. func OpenDBConn( - t testing.TB, server TestServerInterface, params base.TestServerArgs, stopper *stop.Stopper, + t testing.TB, sqlAddr string, useDatabase string, insecure bool, stopper *stop.Stopper, ) *gosql.DB { pgURL, cleanupGoDB := sqlutils.PGUrl( - t, server.ServingSQLAddr(), "StartServer" /* prefix */, url.User(security.RootUser)) - pgURL.Path = params.UseDatabase - if params.Insecure { + t, sqlAddr, "StartServer" /* prefix */, url.User(security.RootUser)) + pgURL.Path = useDatabase + if insecure { pgURL.RawQuery = "sslmode=disable" } goDB, err := gosql.Open("postgres", pgURL.String()) @@ -290,27 +300,22 @@ func StartServerRaw(args base.TestServerArgs) (TestServerInterface, error) { // StartTenant starts a tenant SQL server connecting to the supplied test // server. It uses the server's stopper to shut down automatically. However, // the returned DB is for the caller to close. -func StartTenant(t testing.TB, ts TestServerInterface, params base.TestTenantArgs) *gosql.DB { - pgAddr, _, err := ts.StartTenant(params) +func StartTenant( + t testing.TB, ts TestServerInterface, params base.TestTenantArgs, +) (TestTenantInterface, *gosql.DB) { + tenant, err := ts.StartTenant(params) if err != nil { t.Fatal(err) } - pgURL, cleanupGoDB := sqlutils.PGUrl( - t, pgAddr, t.Name() /* prefix */, url.User(security.RootUser)) - - db, err := gosql.Open("postgres", pgURL.String()) - if err != nil { - t.Fatal(err) - } stopper := params.Stopper if stopper == nil { stopper = ts.Stopper() } - stopper.AddCloser(stop.CloserFn(func() { - cleanupGoDB() - })) - return db + + goDB := OpenDBConn( + t, tenant.SQLAddr(), "", false /* insecure */, stopper) + return tenant, goDB } // GetJSONProto uses the supplied client to GET the URL specified by the parameters diff --git a/pkg/testutils/serverutils/test_tenant_shim.go b/pkg/testutils/serverutils/test_tenant_shim.go new file mode 100644 index 000000000000..f5d9d0252da3 --- /dev/null +++ b/pkg/testutils/serverutils/test_tenant_shim.go @@ -0,0 +1,37 @@ +// Copyright 2016 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. +// +// This file provides generic interfaces that allow tests to set up test tenants +// without importing the server package (avoiding circular dependencies). This + +package serverutils + +import "github.com/cockroachdb/cockroach/pkg/base" + +// TestTenantInterface defines SQL-only tenant functionality that tests need; it +// is implemented by server.TestTenant. +type TestTenantInterface interface { + // SQLInstanceID is the ephemeral ID assigned to a running instance of the + // SQLServer. Each tenant can have zero or more running SQLServer instances. + SQLInstanceID() base.SQLInstanceID + + // SQLAddr returns the tenant's SQL address. + SQLAddr() string + + // HTTPAddr returns the tenant's http address. + HTTPAddr() string + + // PGServer returns the tenant's *pgwire.Server as an interface{}. + PGServer() interface{} + + // DiagnosticsReporter returns the tenant's *diagnostics.Reporter as an + // interface{}. + DiagnosticsReporter() interface{} +} diff --git a/pkg/testutils/testcluster/testcluster.go b/pkg/testutils/testcluster/testcluster.go index 7de3b28c9bd6..77931209de96 100644 --- a/pkg/testutils/testcluster/testcluster.go +++ b/pkg/testutils/testcluster/testcluster.go @@ -394,7 +394,8 @@ func (tc *TestCluster) StartServer( t.Fatalf("%+v", err) } - dbConn := serverutils.OpenDBConn(t, server, serverArgs, server.Stopper()) + dbConn := serverutils.OpenDBConn( + t, server.ServingSQLAddr(), serverArgs.UseDatabase, serverArgs.Insecure, server.Stopper()) tc.mu.Lock() defer tc.mu.Unlock()