Skip to content

Commit

Permalink
server: rationalize the registration of tenant metrics
Browse files Browse the repository at this point in the history
In case of shared-process tenants, the metrics registry of the tenant's
SQLServer is registered with the node's metrics recorder, so that the
tenant metrics are exported together with the system's metrics. Before
this patch, this registration was done in a weird place - it was done
when preparing the arguments that will be used for ultimately
constructing the respective server. Doing this registration before the
server is even constructed, let alone started, is surprising. It's also
pretty broken, since the tenant start can fail afterwards. This patch
rearranges things so that the registration is done when the tenant
server is started.

I believe this patch also fixes a bug around the exporting of tenant
metrics to Graphite. Before, I think the tenant's metrics were exported
twice - once by the tenant and once by the node recorder.

Release note: None
  • Loading branch information
andreimatei committed Mar 6, 2023
1 parent 2b157b6 commit a4543af
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 25 deletions.
4 changes: 4 additions & 0 deletions pkg/server/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,10 @@ type SQLConfig struct {
// LocalKVServerInfo is set in configs for shared-process tenants. It contains
// info for making Batch requests to the local KV server without using gRPC.
LocalKVServerInfo *LocalKVServerInfo

// NodeMetricsRecorder is the node's MetricRecorder; the tenant's metrics will
// be recorded with it. Nil if this is not a shared-process tenant.
NodeMetricsRecorder *status.MetricsRecorder
}

// LocalKVServerInfo is used to group information about the local KV server
Expand Down
8 changes: 6 additions & 2 deletions pkg/server/server_controller_new_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/security/clientsecopts"
"github.com/cockroachdb/cockroach/pkg/security/username"
"github.com/cockroachdb/cockroach/pkg/server/status"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql"
"github.com/cockroachdb/cockroach/pkg/sql/isql"
Expand Down Expand Up @@ -145,7 +146,7 @@ func (s *Server) startTenantServerInternal(
log.Infof(startCtx, "starting tenant server")

// Now start the tenant proper.
tenantServer, err := NewSharedProcessTenantServer(startCtx, stopper, baseCfg, sqlCfg, s.recorder, tenantNameContainer)
tenantServer, err := NewSharedProcessTenantServer(startCtx, stopper, baseCfg, sqlCfg, tenantNameContainer)
if err != nil {
return nil, err
}
Expand All @@ -172,7 +173,7 @@ func (s *Server) makeSharedProcessTenantConfig(
InternalServer: s.node,
ServerInterceptors: s.grpc.serverInterceptorsInfo,
}
baseCfg, sqlCfg, err := makeSharedProcessTenantServerConfig(ctx, tenantID, index, parentCfg, localServerInfo, stopper)
baseCfg, sqlCfg, err := makeSharedProcessTenantServerConfig(ctx, tenantID, index, parentCfg, localServerInfo, stopper, s.recorder)
if err != nil {
return BaseConfig{}, SQLConfig{}, err
}
Expand All @@ -188,6 +189,7 @@ func makeSharedProcessTenantServerConfig(
kvServerCfg Config,
kvServerInfo LocalKVServerInfo,
stopper *stop.Stopper,
nodeMetricsRecorder *status.MetricsRecorder,
) (baseCfg BaseConfig, sqlCfg SQLConfig, err error) {
st := cluster.MakeClusterSettings()

Expand Down Expand Up @@ -356,6 +358,8 @@ func makeSharedProcessTenantServerConfig(
// reach this KV layer without going through the network.
sqlCfg.LocalKVServerInfo = &kvServerInfo

sqlCfg.NodeMetricsRecorder = nodeMetricsRecorder

return baseCfg, sqlCfg, nil
}

Expand Down
44 changes: 21 additions & 23 deletions pkg/server/tenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ func NewSeparateProcessTenantServer(
tenantNameContainer *roachpb.TenantNameContainer,
) (*SQLServerWrapper, error) {
instanceIDContainer := baseCfg.IDContainer.SwitchToSQLIDContainerForStandaloneSQLInstance()
return newTenantServer(ctx, stopper, baseCfg, sqlCfg, nil /* parentRecorder */, tenantNameContainer, instanceIDContainer)
return newTenantServer(ctx, stopper, baseCfg, sqlCfg, tenantNameContainer, instanceIDContainer)
}

// NewSharedProcessTenantServer creates a tenant-specific, SQL-only
Expand All @@ -180,14 +180,13 @@ func NewSharedProcessTenantServer(
stopper *stop.Stopper,
baseCfg BaseConfig,
sqlCfg SQLConfig,
parentRecorder *status.MetricsRecorder,
tenantNameContainer *roachpb.TenantNameContainer,
) (*SQLServerWrapper, error) {
if baseCfg.IDContainer.Get() == 0 {
return nil, errors.AssertionFailedf("programming error: NewSharedProcessTenantServer called before NodeID was assigned.")
}
instanceIDContainer := base.NewSQLIDContainerForNode(baseCfg.IDContainer)
return newTenantServer(ctx, stopper, baseCfg, sqlCfg, parentRecorder, tenantNameContainer, instanceIDContainer)
return newTenantServer(ctx, stopper, baseCfg, sqlCfg, tenantNameContainer, instanceIDContainer)
}

// newTenantServer constructs a SQLServerWrapper.
Expand All @@ -198,7 +197,6 @@ func newTenantServer(
stopper *stop.Stopper,
baseCfg BaseConfig,
sqlCfg SQLConfig,
parentRecorder *status.MetricsRecorder,
tenantNameContainer *roachpb.TenantNameContainer,
instanceIDContainer *base.SQLIDContainer,
) (*SQLServerWrapper, error) {
Expand All @@ -212,7 +210,7 @@ func newTenantServer(
// Inform the server identity provider that we're operating
// for a tenant server.
baseCfg.idProvider.SetTenant(sqlCfg.TenantID)
args, err := makeTenantSQLServerArgs(ctx, stopper, baseCfg, sqlCfg, parentRecorder, tenantNameContainer, instanceIDContainer)
args, err := makeTenantSQLServerArgs(ctx, stopper, baseCfg, sqlCfg, tenantNameContainer, instanceIDContainer)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -618,6 +616,24 @@ func (s *SQLServerWrapper) PreStart(ctx context.Context) error {
s.sqlServer.cfg.HTTPAdvertiseAddr,
s.sqlServer.cfg.SQLAdvertiseAddr,
)
// If there's a higher-level recorder, we link our metrics registry to it.
if s.sqlCfg.NodeMetricsRecorder != nil {
s.sqlCfg.NodeMetricsRecorder.AddTenantRegistry(s.sqlCfg.TenantID, s.registry)
// TODO(aadityasondhi): Unlink the tenant registry if the tenant is stopped.
// See #97889.
} else {
// Export statistics to graphite, if enabled by configuration. We only do
// this if there isn't a higher-level recorder; if there is, that one takes
// responsibility for exporting to Graphite.
var graphiteOnce sync.Once
graphiteEndpoint.SetOnChange(&s.ClusterSettings().SV, func(context.Context) {
if graphiteEndpoint.Get(&s.ClusterSettings().SV) != "" {
graphiteOnce.Do(func() {
startGraphiteStatsExporter(workersCtx, s.stopper, s.recorder, s.ClusterSettings())
})
}
})
}

if !s.sqlServer.cfg.DisableRuntimeStatsMonitor {
// Begin recording runtime statistics.
Expand All @@ -634,16 +650,6 @@ func (s *SQLServerWrapper) PreStart(ctx context.Context) error {
}
}

// Export statistics to graphite, if enabled by configuration.
var graphiteOnce sync.Once
graphiteEndpoint.SetOnChange(&s.ClusterSettings().SV, func(context.Context) {
if graphiteEndpoint.Get(&s.ClusterSettings().SV) != "" {
graphiteOnce.Do(func() {
startGraphiteStatsExporter(workersCtx, s.stopper, s.recorder, s.ClusterSettings())
})
}
})

// After setting modeOperational, we can block until all stores are fully
// initialized.
s.grpc.setMode(modeOperational)
Expand Down Expand Up @@ -886,7 +892,6 @@ func makeTenantSQLServerArgs(
stopper *stop.Stopper,
baseCfg BaseConfig,
sqlCfg SQLConfig,
parentRecorder *status.MetricsRecorder,
tenantNameContainer *roachpb.TenantNameContainer,
instanceIDContainer *base.SQLIDContainer,
) (sqlServerArgs, error) {
Expand Down Expand Up @@ -1070,13 +1075,6 @@ func makeTenantSQLServerArgs(
recorder := status.NewMetricsRecorder(
sqlCfg.TenantID, tenantNameContainer, nil /* nodeLiveness */, nil, /* remoteClocks */
clock.WallClock(), st)
// Note: If the tenant is in-process, we attach this tenant's metric
// registry to the parentRecorder held by the system tenant. This
// ensures that generated Prometheus metrics from the system tenant
// include metrics from this in-process tenant.
if parentRecorder != nil {
parentRecorder.AddTenantRegistry(sqlCfg.TenantID, registry)
}

var runtime *status.RuntimeStatSampler
if baseCfg.RuntimeStatSampler != nil {
Expand Down

0 comments on commit a4543af

Please sign in to comment.