Skip to content

Commit

Permalink
distsql: query count metrics
Browse files Browse the repository at this point in the history
This adds metrics for counts of active DistSQL queries and flows, which
will give some visibility into the status of long-running queries.
  • Loading branch information
couchand committed Jul 17, 2017
1 parent 8a21a56 commit 162af2a
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 8 deletions.
15 changes: 10 additions & 5 deletions pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,9 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
)
rootSQLMemoryMonitor.Start(context.Background(), nil, mon.MakeStandaloneBudget(s.cfg.SQLMemoryPoolSize))

distSQLMetrics := sql.MakeMemMetrics("distsql", cfg.HistogramWindowInterval())
s.registry.AddMetric(distSQLMetrics.CurBytesCount)
s.registry.AddMetric(distSQLMetrics.MaxBytesHist)
distSQLMemMetrics := sql.MakeMemMetrics("distsql", cfg.HistogramWindowInterval())
s.registry.AddMetric(distSQLMemMetrics.CurBytesCount)
s.registry.AddMetric(distSQLMemMetrics.MaxBytesHist)

// Set up the DistSQL temp engine.

Expand Down Expand Up @@ -309,6 +309,9 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
}
}

distSQLMetrics := distsqlrun.MakeDistSQLMetrics()
s.registry.AddMetricStruct(distSQLMetrics)

// Set up the DistSQL server.
distSQLCfg := distsqlrun.ServerConfig{
AmbientContext: s.cfg.AmbientCtx,
Expand All @@ -322,8 +325,10 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
TempStorage: tempEngine,

ParentMemoryMonitor: &rootSQLMemoryMonitor,
Counter: distSQLMetrics.CurBytesCount,
Hist: distSQLMetrics.MaxBytesHist,
Counter: distSQLMemMetrics.CurBytesCount,
Hist: distSQLMemMetrics.MaxBytesHist,

Metrics: &distSQLMetrics,
}
if s.cfg.TestingKnobs.DistSQL != nil {
distSQLCfg.TestingKnobs = *s.cfg.TestingKnobs.DistSQL.(*distsqlrun.TestingKnobs)
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/distsql_running.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ func (dsp *distSQLPlanner) Run(

log.VEvent(ctx, 1, "running DistSQL plan")

dsp.distSQLSrv.ServerConfig.Metrics.QueryStart()
defer dsp.distSQLSrv.ServerConfig.Metrics.QueryStop()

recv.resultToStreamColMap = plan.planToStreamColMap
thisNodeID := dsp.nodeDesc.NodeID

Expand Down
8 changes: 7 additions & 1 deletion pkg/sql/distsqlrun/flow_scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ type flowScheduler struct {
log.AmbientContext
stopper *stop.Stopper
flowDoneCh chan *Flow
metrics *DistSQLMetrics

mu struct {
syncutil.Mutex
Expand All @@ -52,11 +53,14 @@ type flowWithCtx struct {
flow *Flow
}

func newFlowScheduler(ambient log.AmbientContext, stopper *stop.Stopper) *flowScheduler {
func newFlowScheduler(
ambient log.AmbientContext, stopper *stop.Stopper, metrics *DistSQLMetrics,
) *flowScheduler {
fs := &flowScheduler{
AmbientContext: ambient,
stopper: stopper,
flowDoneCh: make(chan *Flow, flowDoneChanSize),
metrics: metrics,
}
fs.mu.queue = list.New()
return fs
Expand All @@ -71,6 +75,7 @@ func (fs *flowScheduler) canRunFlow(_ *Flow) bool {
// runFlowNow starts the given flow; does not wait for the flow to complete.
func (fs *flowScheduler) runFlowNow(ctx context.Context, f *Flow) {
fs.mu.numRunning++
fs.metrics.FlowStart()
f.Start(ctx, func() { fs.flowDoneCh <- f })
// TODO(radu): we could replace the WaitGroup with a structure that keeps a
// refcount and automatically runs Cleanup() when the count reaches 0.
Expand Down Expand Up @@ -116,6 +121,7 @@ func (fs *flowScheduler) Start() {
case <-fs.flowDoneCh:
fs.mu.Lock()
fs.mu.numRunning--
fs.metrics.FlowStop()
if !stopped {
if frElem := fs.mu.queue.Front(); frElem != nil {
n := frElem.Value.(*flowWithCtx)
Expand Down
80 changes: 80 additions & 0 deletions pkg/sql/distsqlrun/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright 2016 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
//
// Author: Andrew Dona-Couch ([email protected])

package distsqlrun

import (
"github.com/cockroachdb/cockroach/pkg/util/metric"
)

// DistSQLMetrics contains pointers to the metrics for
// monitoring DistSQL processing.
type DistSQLMetrics struct {
QueriesActive *metric.Gauge
QueriesTotal *metric.Counter
FlowsActive *metric.Gauge
FlowsTotal *metric.Counter
}

// MetricStruct implements the metrics.Struct interface.
func (DistSQLMetrics) MetricStruct() {}

var _ metric.Struct = DistSQLMetrics{}

// MakeDistSQLMetrics instantiates the metrics holder for DistSQL monitoring.
func MakeDistSQLMetrics() DistSQLMetrics {
metaQueriesActive := metric.Metadata{
Name: "sql.distsql.queries.active",
Help: "Number of currently active DistSQL queries"}
metaQueriesTotal := metric.Metadata{
Name: "sql.distsql.queries.total",
Help: "Number of DistSQL queries executed"}
metaFlowsActive := metric.Metadata{
Name: "sql.distsql.flows.active",
Help: "Number of currently active DistSQL flows"}
metaFlowsTotal := metric.Metadata{
Name: "sql.distsql.flows.total",
Help: "Number of DistSQL flows executed"}

return DistSQLMetrics{
QueriesActive: metric.NewGauge(metaQueriesActive),
QueriesTotal: metric.NewCounter(metaQueriesTotal),
FlowsActive: metric.NewGauge(metaFlowsActive),
FlowsTotal: metric.NewCounter(metaFlowsTotal),
}
}

// QueryStart registers the start of a new DistSQL query.
func (m *DistSQLMetrics) QueryStart() {
m.QueriesActive.Inc(1)
m.QueriesTotal.Inc(1)
}

// QueryStop registers the end of a DistSQL query.
func (m *DistSQLMetrics) QueryStop() {
m.QueriesActive.Dec(1)
}

// FlowStart registers the start of a new DistSQL flow.
func (m *DistSQLMetrics) FlowStart() {
m.FlowsActive.Inc(1)
m.FlowsTotal.Inc(1)
}

// FlowStop registers the end of a DistSQL flow.
func (m *DistSQLMetrics) FlowStop() {
m.FlowsActive.Dec(1)
}
4 changes: 3 additions & 1 deletion pkg/sql/distsqlrun/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ type ServerConfig struct {
// cockroach node does not have an engine for temporary storage.
TempStorage engine.Engine

Metrics *DistSQLMetrics

// NodeID is the id of the node on which this Server is running.
NodeID *base.NodeIDContainer
ClusterID uuid.UUID
Expand Down Expand Up @@ -139,7 +141,7 @@ func NewServer(ctx context.Context, cfg ServerConfig) *ServerImpl {
ServerConfig: cfg,
regexpCache: parser.NewRegexpCache(512),
flowRegistry: makeFlowRegistry(),
flowScheduler: newFlowScheduler(cfg.AmbientContext, cfg.Stopper),
flowScheduler: newFlowScheduler(cfg.AmbientContext, cfg.Stopper, cfg.Metrics),
memMonitor: mon.MakeMonitor("distsql",
cfg.Counter, cfg.Hist, -1 /* increment: use default block size */, noteworthyMemoryUsageBytes),
tempStorage: cfg.TempStorage,
Expand Down
2 changes: 1 addition & 1 deletion pkg/ui/embedded.go

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions pkg/ui/src/views/cluster/containers/nodeGraphs/dashboards/sql.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,45 @@ export default function (props: GraphDashboardProps) {
</Axis>
</LineGraph>,

<LineGraph
title="DistSQL Queries Active"
sources={nodeSources}
tooltip="The total number of active DistSQL queries on all nodes."
>
<Axis>
<Metric name="cr.node.sql.distsql.queries.active" title="Queries Active" />
</Axis>
</LineGraph>,

<LineGraph
title="DistSQL Flows Active, total"
sources={nodeSources}
tooltip="The total number of active DistSQL flows on all nodes."
>
<Axis>
<Metric name="cr.node.sql.distsql.flows.active" title="Flows Active" />
</Axis>
</LineGraph>,

<LineGraph
title="DistSQL Flows Active, by node"
sources={nodeSources}
tooltip="The number of DistSQL flows running on each node."
>
<Axis>
{
_.map(nodeIDs, (node) => (
<Metric
key={node}
name="cr.node.sql.distsql.flows.active"
title={nodeAddress(nodesSummary, node)}
sources={[node]}
/>
))
}
</Axis>
</LineGraph>,

<LineGraph
title="Service Latency: SQL, 99th percentile"
tooltip={(
Expand Down

0 comments on commit 162af2a

Please sign in to comment.